From 6adb2902107c29ba8770e20a5e8a5a1dd0f750c0 Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Sat, 24 Aug 2024 23:43:58 -0700 Subject: [PATCH 01/91] temporary weight adjust index --- lib/local-execution/src/local_slots_backing.cc | 14 +++++++++++--- .../test/src/test_local_slots_backing.cc | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 0ec9068c6a..c8d186a0fe 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -76,13 +76,22 @@ GenericTensorAccessorW const & TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { TensorSlotsBacking mapping; + int num_inputs = 0; + for (auto const &tensor_binding : binding.get_tensor_bindings()) { + if (tensor_binding.first.is_grad == IsGrad::NO && tensor_binding.second.role == TensorRole::INPUT) { + num_inputs += 1; + } + } + for (auto const &tensor_binding : binding.get_tensor_bindings()) { SlotGradId slot_grad_id = tensor_binding.first; OpTensorSpec tensor_spec = tensor_binding.second; std::vector tensor_guids; + int weight_adjusted_idx = 0; switch (tensor_spec.role) { - case TensorRole::INPUT: case TensorRole::WEIGHT: + weight_adjusted_idx = num_inputs; + case TensorRole::INPUT: assert(contains_key(this->input_tensor_slots, op_guid)); tensor_guids = this->input_tensor_slots.at(op_guid); break; @@ -96,10 +105,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( // "type_is_unformattable" error } - assert(tensor_guids.size() > tensor_spec.idx); IsGrad is_grad = slot_grad_id.is_grad; GenericTensorAccessorW tensor_backing = - this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad); + this->get_tensor_backing(tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad); mapping.insert({slot_grad_id, tensor_backing}); } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 542aa66087..e31e7cf2b4 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -188,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { b.bind(QUERY, input_tensor(0)); b.bind(KEY, input_tensor(1)); b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(3)); + b.bind(WEIGHTS, weight_tensor(0)); b.bind(OUTPUT, output_tensor(0)); b.bind_grad(QUERY, input_tensor(0)); From 61697c2a30338ae39fa10ef35899f519c8d2e514 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 01:45:52 -0700 Subject: [PATCH 02/91] Loss function --- lib/kernels/CMakeLists.txt | 1 + .../include/kernels/optimizer_kernels.h | 6 +- ...timizer_kernel.cu => optimizer_kernels.cu} | 1 + .../generic_task_impl_function.h | 33 +++++++ .../local-execution/local_slots_backing.h | 11 ++- .../local-execution/local_training_backing.h | 11 ++- .../include/local-execution}/loss_functions.h | 22 ++--- .../model_training_instance.struct.toml | 26 +++++ .../task_arg_spec.variant.toml | 18 ++++ .../task_impl_function.variant.toml | 5 + .../include/local-execution/task_invocation.h | 71 ++++++++++++++ .../include/local-execution/task_signature.h | 57 +++++++++++ .../task_signature.struct.toml | 29 ++++++ .../tensor_guid_slot_spec.struct.toml | 27 ++++++ .../tensor_guid_spec.struct.toml | 22 +++++ .../src/generic_task_impl_function.cc | 53 ++++++++++ 
.../src/local_cost_estimator.cc | 3 +- .../src/local_slots_backing.cc | 51 +++++++++- .../src/local_training_backing.cc | 50 +++++++++- .../src/loss_functions.cc | 96 ++++++++----------- lib/local-execution/src/ops/attention.cc | 2 +- .../local-execution => src}/ops/attention.h | 0 lib/local-execution/src/task_invocation.cc | 49 ++++++++++ lib/local-execution/src/task_signature.cc | 25 +++++ .../src/task_signature_impl.cc | 2 +- .../test/src/test_task_registry.cc | 1 - .../op-attrs/ops/loss_attrs.variant.toml | 22 +++++ .../op-attrs/ops/loss_function.enum.toml | 23 +++++ .../include/op-attrs/ops/loss_functions.h | 68 +------------ .../op-attrs/ops/other_loss_attrs.struct.toml | 18 ++++ ...arse_categorical_ce_loss_attrs.struct.toml | 14 +++ lib/op-attrs/src/loss_functions.cc | 25 ++--- 32 files changed, 671 insertions(+), 171 deletions(-) rename lib/kernels/src/cuda/{optimizer_kernel.cu => optimizer_kernels.cu} (99%) create mode 100644 lib/local-execution/include/local-execution/generic_task_impl_function.h rename lib/{runtime/src => local-execution/include/local-execution}/loss_functions.h (63%) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.struct.toml create mode 100644 lib/local-execution/include/local-execution/task_arg_spec.variant.toml create mode 100644 lib/local-execution/include/local-execution/task_invocation.h create mode 100644 lib/local-execution/include/local-execution/task_signature.h create mode 100644 lib/local-execution/include/local-execution/task_signature.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml create mode 100644 lib/local-execution/src/generic_task_impl_function.cc rename lib/{runtime => local-execution}/src/loss_functions.cc (63%) rename lib/local-execution/{include/local-execution => src}/ops/attention.h (100%) create mode 100644 lib/local-execution/src/task_invocation.cc create mode 100644 lib/local-execution/src/task_signature.cc create mode 100644 lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..54fa3c9583 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -8,6 +8,7 @@ file(GLOB_RECURSE SRC LIST_DIRECTORIES False src/*.cc src/cuda/cuda_helper.cu + src/cuda/loss_functions_kernels.cu src/cuda/ops/*.cu ) diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..fcbf9454f8 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -20,7 +21,8 @@ void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, diff --git 
a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 99% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..1bb38b2870 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "device.h" #include "kernels/optimizer_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/local-execution/include/local-execution/generic_task_impl_function.h new file mode 100644 index 0000000000..425740f61d --- /dev/null +++ b/lib/local-execution/include/local-execution/generic_task_impl_function.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H + +#include "local-execution/device_specific_device_states.dtg.h" +#include "local-execution/task_argument_accessor.h" + +namespace FlexFlow { + +struct GenericTaskImplFunction { + + void (*function_ptr)(TaskArgumentAccessor const &); + + bool operator==(GenericTaskImplFunction const &) const; + bool operator!=(GenericTaskImplFunction const &) const; + bool operator<(GenericTaskImplFunction const &) const; + bool operator>(GenericTaskImplFunction const &) const; + bool operator<=(GenericTaskImplFunction const &) const; + bool operator>=(GenericTaskImplFunction const &) const; +}; + +std::string format_as(GenericTaskImplFunction const &x); +std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash<::FlexFlow::GenericTaskImplFunction> { + size_t operator()(::FlexFlow::GenericTaskImplFunction const &) const; +}; +} // namespace std + +#endif diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 6a0c28e988..312a13cc01 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -7,6 +7,7 @@ #include "local-execution/op_task_invocation.h" #include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.h" namespace FlexFlow { @@ -19,23 +20,29 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); + void allocate_label_tensor(tensor_guid_t const &, + ComputationGraph const &, + Allocator &); void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; + TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; + ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; ConcreteArgSpec resolve_runtime_arg_ref_spec(RuntimeArgRefSpec const &) const; ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, layer_guid_t const &) const; + GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &, + IsGrad) const; + private: bool is_tensor_allocated(tensor_guid_t const &) const; bool is_gradient_tensor_allocated(tensor_guid_t const &) const; - GenericTensorAccessorW const 
&get_tensor_backing(tensor_guid_t const &,
-                                             IsGrad) const;
 
 public:
   // tensors
diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h
index b398bb8cc3..55983086c2 100644
--- a/lib/local-execution/include/local-execution/local_training_backing.h
+++ b/lib/local-execution/include/local-execution/local_training_backing.h
@@ -2,7 +2,9 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H
 
 #include "local-execution/local_slots_backing.h"
+#include "local-execution/model_training_instance.dtg.h"
 #include "local-execution/task_registry.h"
+#include "op-attrs/ops/loss_functions.h"
 
 namespace FlexFlow {
 
@@ -13,15 +15,17 @@ struct LocalTrainingBacking {
   LocalTrainingBacking(Allocator const &,
                        ComputationGraph const &,
                        TensorBackingMap const &,
-                       RuntimeArgConfig const &);
+                       RuntimeArgConfig const &,
+                       std::optional<ModelTrainingInstance> const &);
 
   void execute_init();
   PerLayerElapsedTime execute_forward();
   PerLayerElapsedTime execute_backward();
   void execute_update();
 
-  TaskArgumentAccessor get_task_arg_accessor(OpTaskInvocation const &,
-                                             layer_guid_t const &) const;
+  TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const;
+  TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &,
+                                                layer_guid_t const &) const;
 
 private:
   DeviceSpecificDeviceStates call_init_task_impl(task_id_t,
@@ -33,6 +37,7 @@ struct LocalTrainingBacking {
   ComputationGraph computation_graph;
   TaskRegistry task_registry;
   LocalSlotsBacking local_slots_backing;
+  std::optional<ModelTrainingInstance> training_instance;
 };
 
 } // namespace FlexFlow
diff --git a/lib/runtime/src/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
similarity index 63%
rename from lib/runtime/src/loss_functions.h
rename to lib/local-execution/include/local-execution/loss_functions.h
index 620ebc6936..e5e81b60a7 100644
--- a/lib/runtime/src/loss_functions.h
+++ b/lib/local-execution/include/local-execution/loss_functions.h
@@ -13,24 +13,20 @@
  * limitations under the License.
*/ -#ifndef _FF_LOSS_FUNCTIONS_H_ -#define _FF_LOSS_FUNCTIONS_H_ +#ifndef _FLEXFLOW_LOSS_FUNCTIONS_H_ +#define _FLEXFLOW_LOSS_FUNCTIONS_H_ +#include "local-execution/task_impl_function.dtg.h" +#include "local-execution/task_invocation.h" +#include "local-execution/task_signature.h" #include "op-attrs/ops/loss_functions.h" -#include "pcg/operator.h" -#include "pcg/parallel_tensor.h" -#include "pcg/parallel_tensor_guid_t.h" -#include "task_spec/task_invocation.h" -#include "tasks.h" namespace FlexFlow { -template <> -void register_task(); - -TaskInvocation backward(LossAttrs const &, - parallel_tensor_guid_t logit, - parallel_tensor_guid_t label); +TaskImplFunction get_loss_bwd_task_impl(); +TaskSignature get_loss_bwd_signature(); +TaskInvocation + backward(LossAttrs const &, tensor_guid_t logit, tensor_guid_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml new file mode 100644 index 0000000000..ea7e8d24ab --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "ModelTrainingInstance" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/optional.h", + "op-attrs/ops/loss_attrs.dtg.h", + "pcg/tensor_guid_t.dtg.h", +] + +[[fields]] +name = "loss_attrs" +type = "::FlexFlow::LossAttrs" + +[[fields]] +name = "label_tensor" +type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "logit_tensor" +type = "::FlexFlow::tensor_guid_t" diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml new file mode 100644 index 0000000000..a6df0c8a7d --- /dev/null +++ b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "TaskArgSpec" +features = [ + "eq" +] + +includes = [ + "local-execution/concrete_arg.h", + "local-execution/runtime_arg_ref.h" +] + +[[values]] +type = "::FlexFlow::ConcreteArgSpec" +key = "concrete_arg_spec" + +[[values]] +type = "::FlexFlow::RuntimeArgRefSpec" +key = "runtime_arg_ref" diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml index a12be37da2..1be18bebfa 100644 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ b/lib/local-execution/include/local-execution/task_impl_function.variant.toml @@ -10,6 +10,7 @@ features = [ includes = [ "local-execution/init_task_impl_function.h", "local-execution/fwd_bwd_task_impl_function.h", + "local-execution/generic_task_impl_function.h", ] [[values]] @@ -19,3 +20,7 @@ key = "init_task_impl_function" [[values]] type = "::FlexFlow::FwdBwdTaskImplFunction" key = "fwd_bwd_task_impl_function" + +[[values]] +type = "::FlexFlow::GenericTaskImplFunction" +key = "generic_task_impl_function" diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h new file mode 100644 index 0000000000..2317c65c02 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_invocation.h @@ -0,0 +1,71 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H + +#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_id_t.dtg.h" 
+#include "local-execution/task_arg_spec.dtg.h" +#include "local-execution/task_id_t.dtg.h" +#include "local-execution/task_signature.dtg.h" +#include "local-execution/tensor_guid_spec.dtg.h" + +namespace FlexFlow { + +struct TaskBinding { + TaskBinding() = default; + + void bind(int, TensorGuidSpec const &); + void bind(slot_id_t, TensorGuidSpec const &); + + template + void bind_arg(int name, T const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, T const &t) { + this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)}); + } + + template + void bind_arg(int name, RuntimeArgRef const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, RuntimeArgRef const &ref) { + this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)}); + } + + bool operator==(TaskBinding const &other) const; + bool operator!=(TaskBinding const &other) const; + + std::unordered_map const & + get_tensor_bindings() const; + std::unordered_map const &get_arg_bindings() const; + +private: + std::unordered_map tensor_bindings; + std::unordered_map arg_bindings; + +private: + void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); + std::tuple + tie() const; +}; + +struct TaskInvocation { +public: + TaskInvocation() = delete; + TaskInvocation(task_id_t task_id, TaskBinding const &binding) + : task_id(task_id), binding(binding) {} + +public: + task_id_t task_id; + TaskBinding binding; +}; + +bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h new file mode 100644 index 0000000000..d31a67e027 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_signature.h @@ -0,0 +1,57 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H + +// #include "local-execution/tensor_guid_slot_spec.dtg.h" +// #include "local-execution/serialization.h" +// #include "utils/hash/unordered_map.h" +// #include "utils/hash/unordered_set.h" +// #include "utils/type_index.h" + +#include "local-execution/task_signature.dtg.h" + +namespace FlexFlow { + +TaskSignature make_empty_task_signature(); + +void add_slot(TaskSignature &, + int name, + IsGrad, + SlotType slot_type = SlotType::TENSOR); +void add_slot(TaskSignature &, + slot_id_t name, + IsGrad, + SlotType slot_type = SlotType::TENSOR); + +template +void add_arg_slot(TaskSignature &task_signature, int name) { + add_arg_slot(task_signature, slot_id_t{name}); +} + +template +void add_arg_slot(TaskSignature &task_signature, slot_id_t name) { + // static_assert(is_serializable::value, "Type must be serializable"); + task_signature.task_arg_types.insert({name, get_type_index_for_type()}); +} + +template +void add_return_value(TaskSignature &task_signature) { + task_signature.return_value = get_type_index_for_type(); +} + +// adds arg_slot without checking is_serializable, used for arguments that are +// deviceSpecific +template +void add_unchecked_arg_slot(TaskSignature &task_signature, int name) { + add_unchecked_arg_slot(task_signature, slot_id_t{name}); +} + +// adds arg_slot without checking is_serializable, used for arguments that are +// deviceSpecific +template +void add_unchecked_arg_slot(TaskSignature &task_signature, slot_id_t name) { + task_signature.task_arg_types.insert({name, get_type_index_for_type()}); +} + 
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml
new file mode 100644
index 0000000000..f86f7b0c57
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_signature.struct.toml
@@ -0,0 +1,29 @@
+namespace = "FlexFlow"
+name = "TaskSignature"
+features = [
+  "eq",
+  "fmt",
+]
+
+includes = [
+  "local-execution/tensor_guid_slot_spec.dtg.h",
+  "utils/type_index.h",
+  "utils/optional.h"
+]
+
+src_includes = [
+  "utils/fmt/unordered_map.h",
+  "utils/fmt/unordered_set.h",
+]
+
+[[fields]]
+name = "return_value"
+type = "std::optional<std::type_index>"
+
+[[fields]]
+name = "task_arg_types"
+type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>"
+
+[[fields]]
+name = "tensor_guid_slots"
+type = "std::unordered_set<::FlexFlow::TensorGuidSlotSpec>"
diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
new file mode 100644
index 0000000000..4b3e5b2674
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
@@ -0,0 +1,27 @@
+namespace = "FlexFlow"
+name = "TensorGuidSlotSpec"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "ord",
+]
+
+includes = [
+  "local-execution/slot_id_t.dtg.h",
+  "local-execution/slot_type.dtg.h",
+  "local-execution/is_grad.dtg.h",
+]
+
+[[fields]]
+name = "name"
+type = "::FlexFlow::slot_id_t"
+
+[[fields]]
+name = "slot_type"
+type = "::FlexFlow::SlotType"
+
+[[fields]]
+name = "is_grad"
+type = "::FlexFlow::IsGrad"
+
diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
new file mode 100644
index 0000000000..a51d6ccf1b
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
@@ -0,0 +1,22 @@
+namespace = "FlexFlow"
+name = "TensorGuidSpec"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "ord"
+]
+
+includes = [
+  "pcg/tensor_guid_t.dtg.h",
+  "local-execution/is_grad.dtg.h",
+]
+
+[[fields]]
+name = "tensor_guid"
+type = "::FlexFlow::tensor_guid_t"
+
+[[fields]]
+name = "is_grad"
+type = "::FlexFlow::IsGrad"
+
diff --git a/lib/local-execution/src/generic_task_impl_function.cc b/lib/local-execution/src/generic_task_impl_function.cc
new file mode 100644
index 0000000000..87d4db53e6
--- /dev/null
+++ b/lib/local-execution/src/generic_task_impl_function.cc
@@ -0,0 +1,53 @@
+#include "local-execution/generic_task_impl_function.h"
+
+namespace FlexFlow {
+
+bool GenericTaskImplFunction::operator==(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr == other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator!=(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr != other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator<(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr < other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator>(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr > other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator<=(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr <= other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator>=(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr >= other.function_ptr;
+}
+
+std::string format_as(GenericTaskImplFunction const &x) {
+  std::ostringstream oss;
+  oss << "<GenericTaskImplFunction function_ptr=" << x.function_ptr << ">";
+  return oss.str();
+}
+std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x) {
+  return s << fmt::to_string(x);
+}
+
+} // namespace FlexFlow
+
+namespace std {
+size_t hash<::FlexFlow::GenericTaskImplFunction>::operator()(
+    ::FlexFlow::GenericTaskImplFunction const &x) const {
+  return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
+}
+} // namespace std
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
index d4e0467cbf..1ca422d8e1 100644
--- a/lib/local-execution/src/local_cost_estimator.cc
+++ b/lib/local-execution/src/local_cost_estimator.cc
@@ -75,7 +75,8 @@ CostDetails LocalCostEstimator::estimate_cost(
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
                                      tensor_backing_map,
-                                     this->runtime_arg_config);
+                                     this->runtime_arg_config,
+                                     std::nullopt);
 
   local_backing.execute_init();
   PerLayerElapsedTime fwd = local_backing.execute_forward();
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index c8d186a0fe..967f8d9ba3 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -15,6 +15,14 @@ void LocalSlotsBacking::add_per_device_op_state(
   this->per_device_op_states.insert({op_guid, device_state});
 }
 
+void LocalSlotsBacking::allocate_label_tensor(tensor_guid_t const &label_tensor,
+                                              ComputationGraph const &cg,
+                                              Allocator &allocator) {
+  GenericTensorAccessorW tensor_backing =
+      allocator.allocate_tensor(get_tensor_attrs(cg, label_tensor).shape);
+  this->tensor_mapping.insert({label_tensor, tensor_backing});
+}
+
 void LocalSlotsBacking::allocate_outgoing_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
@@ -78,7 +86,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
   TensorSlotsBacking mapping;
   int num_inputs = 0;
   for (auto const &tensor_binding : binding.get_tensor_bindings()) {
-    if (tensor_binding.first.is_grad == IsGrad::NO && tensor_binding.second.role == TensorRole::INPUT) {
+    if (tensor_binding.first.is_grad == IsGrad::NO &&
+        tensor_binding.second.role == TensorRole::INPUT) {
       num_inputs += 1;
     }
   }
@@ -90,7 +99,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     int weight_adjusted_idx = 0;
     switch (tensor_spec.role) {
       case TensorRole::WEIGHT:
-      weight_adjusted_idx = num_inputs;
+        weight_adjusted_idx = num_inputs;
       case TensorRole::INPUT:
         assert(contains_key(this->input_tensor_slots, op_guid));
         tensor_guids = this->input_tensor_slots.at(op_guid);
@@ -106,14 +115,30 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing =
-        this->get_tensor_backing(tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
+        tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
   return mapping;
 }
 
+TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
+    TaskBinding const &binding) const {
+  TensorSlotsBacking mapping;
+
+  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
+    SlotGradId slot_grad_id = tensor_binding.first;
+    TensorGuidSpec tensor_spec = tensor_binding.second;
+
+    GenericTensorAccessorW accessor =
+        this->get_tensor_backing(tensor_spec.tensor_guid, slot_grad_id.is_grad);
+
+    mapping.insert({slot_grad_id, accessor});
+  }
+
+  return mapping;
+}
+
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
   ArgSlotsBacking mapping;
@@ -135,6 +160,24 @@ ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
   return mapping;
 }
 
+ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
+    TaskBinding const &binding) const {
+  ArgSlotsBacking mapping;
+  for (auto const &arg_binding : binding.get_arg_bindings()) {
+    slot_id_t arg_slot = arg_binding.first;
+    TaskArgSpec task_arg_spec = arg_binding.second;
+
+    mapping.insert({arg_slot,
+                    task_arg_spec.visit<ConcreteArgSpec>(overload{
+                        [&](RuntimeArgRefSpec const &s) {
+                          return this->resolve_runtime_arg_ref_spec(s);
+                        },
+                        [](ConcreteArgSpec const &s) { return s; },
+                    })});
+  }
+  return mapping;
+}
+
 ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec(
     OpArgRefSpec const &op_arg_ref_spec, layer_guid_t const &op_guid) const {
   if (op_arg_ref_spec.holds<DeviceSpecificDeviceStates>()) {
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index a2ee06a95a..f54d0ddaad 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -1,4 +1,5 @@
 #include "local-execution/local_training_backing.h"
+#include "local-execution/loss_functions.h"
 #include "local-execution/task_signature_impl.h"
 #include "utils/containers/reversed.h"
 #include "utils/exception.h"
@@ -9,10 +10,12 @@ LocalTrainingBacking::LocalTrainingBacking(
     Allocator const &allocator,
     ComputationGraph const &computation_graph,
     TensorBackingMap const &tensor_backing_mapping,
-    RuntimeArgConfig const &runtime_arg_config)
+    RuntimeArgConfig const &runtime_arg_config,
+    std::optional<ModelTrainingInstance> const &training_instance)
     : allocator(allocator), computation_graph(computation_graph),
       local_slots_backing(tensor_backing_mapping, runtime_arg_config),
-      task_registry(empty_task_registry()) {
+      task_registry(empty_task_registry()),
+      training_instance(training_instance) {
 
   for (layer_guid_t const &node : topological_ordering(computation_graph)) {
     ComputationGraphOpAttrs attrs =
@@ -25,6 +28,13 @@ LocalTrainingBacking::LocalTrainingBacking(
     // register tasks
     register_tasks_for_layer(this->task_registry, node, attrs);
   }
+
+  if (this->training_instance.has_value()) {
+    this->local_slots_backing.allocate_label_tensor(
+        this->training_instance.value().label_tensor,
+        computation_graph,
+        this->allocator);
+  }
 }
 
 DeviceSpecificDeviceStates
@@ -56,7 +66,7 @@ void LocalTrainingBacking::execute_init() {
       OpTaskInvocation invocation = init(attrs);
 
       TaskArgumentAccessor accessor =
-          this->get_task_arg_accessor(invocation, operator_node);
+          this->get_op_task_arg_accessor(invocation, operator_node);
       DeviceSpecificDeviceStates device_state =
           this->call_init_task_impl(invocation.task_id, accessor);
       this->local_slots_backing.add_per_device_op_state(operator_node,
@@ -67,6 +77,7 @@ void LocalTrainingBacking::execute_init() {
 
 PerLayerElapsedTime LocalTrainingBacking::execute_forward() {
   PerLayerElapsedTime per_op_elapsed_time;
+
   for (layer_guid_t const &operator_node :
        topological_ordering(this->computation_graph)) {
     if (this->task_registry.forward_task_ids.at(operator_node).has_value()) {
@@ -75,17 +86,35 @@ PerLayerElapsedTime LocalTrainingBacking::execute_forward() {
       OpTaskInvocation invocation = forward(attrs);
 
       TaskArgumentAccessor accessor =
-          this->get_task_arg_accessor(invocation, operator_node);
+          this->get_op_task_arg_accessor(invocation, operator_node);
       std::optional<float> elapsed_time =
           this->call_task_impl(invocation.task_id, accessor);
       per_op_elapsed_time.insert({operator_node, elapsed_time});
     }
   }
+
   return per_op_elapsed_time;
 }
 
 PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
   PerLayerElapsedTime per_op_elapsed_time;
+
+  // compute loss
+  if (this->training_instance.has_value()) {
+    ModelTrainingInstance unwrapped_training_instance =
+        training_instance.value();
+    TaskInvocation loss_invocation =
+        backward(unwrapped_training_instance.loss_attrs,
+                 unwrapped_training_instance.logit_tensor,
+                 unwrapped_training_instance.label_tensor);
+    assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+    TaskArgumentAccessor loss_accessor =
+        this->get_task_arg_accessor(loss_invocation);
+    TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+    loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+  }
+
+  // backward through computation graph
   for (layer_guid_t const &operator_node :
        reversed(topological_ordering(this->computation_graph))) {
     if (this->task_registry.backward_task_ids.at(operator_node).has_value()) {
@@ -94,7 +123,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
       OpTaskInvocation invocation = backward(attrs);
 
       TaskArgumentAccessor accessor =
-          this->get_task_arg_accessor(invocation, operator_node);
+          this->get_op_task_arg_accessor(invocation, operator_node);
       std::optional<float> elapsed_time =
           this->call_task_impl(invocation.task_id, accessor);
       per_op_elapsed_time.insert({operator_node, elapsed_time});
@@ -108,6 +137,17 @@ void LocalTrainingBacking::execute_update() {
 }
 
 TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
+    TaskInvocation const &invocation) const {
+  TensorSlotsBacking tensor_slots_backing =
+      this->local_slots_backing.construct_tensor_slots_backing(
+          invocation.binding);
+  ArgSlotsBacking arg_slots_backing =
+      this->local_slots_backing.construct_arg_slots_backing(invocation.binding);
+  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
+      this->allocator, tensor_slots_backing, arg_slots_backing);
+}
+
+TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor(
     OpTaskInvocation const &invocation, layer_guid_t const &op_guid) const {
   TensorSlotsBacking tensor_slots_backing =
       this->local_slots_backing.construct_tensor_slots_backing(
diff --git a/lib/runtime/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc
similarity index 63%
rename from lib/runtime/src/loss_functions.cc
rename to lib/local-execution/src/loss_functions.cc
index b0d5ac2029..6b23d5da51 100644
--- a/lib/runtime/src/loss_functions.cc
+++ b/lib/local-execution/src/loss_functions.cc
@@ -13,56 +13,44 @@
  * limitations under the License.
 */
 
-#include "loss_functions.h"
+#include "op-attrs/ops/loss_functions.h"
 #include "kernels/loss_function_kernels.h"
-#include "legion.h"
-#include "runtime/profiling.h"
-#include "task_spec/task_argument_accessor.h"
+#include "local-execution/loss_functions.h"
+#include "local-execution/profiling.h"
 
 namespace FlexFlow {
 
-enum LossSlots {
-  LOGIT_GRAD,
-  LOGIT,
-  LABEL,
-  LOSS_ATTRS,
-  BATCH_SIZE,
-  PROFILING_SETTINGS
-};
+enum Slots { LOGIT, LABEL, ATTRS, PROFILING };
 
-TaskInvocation backward_invocation(LossAttrs const &attrs,
-                                   EnableProfiling enable_profiling,
-                                   parallel_tensor_guid_t logit,
-                                   parallel_tensor_guid_t label) {
-  auto binding = IndexTaskBinding{LOGIT};
-  StandardTypedTaskArg<LossAttrs> arg = attrs;
-  binding.bind_arg(LOSS_ATTRS, attrs);
-  binding.bind(LOGIT, logit);
-  binding.bind(LABEL, label);
-  binding.bind(LOGIT_GRAD, grad(logit));
-  binding.bind_arg(PROFILING_SETTINGS, profiling_settings());
+TaskSignature get_loss_bwd_signature() {
+  TaskSignature sig = make_empty_task_signature();
+  add_slot(sig, LOGIT, IsGrad::NO);
+  add_slot(sig, LABEL, IsGrad::NO);
+  add_slot(sig, LOGIT, IsGrad::YES);
+  add_arg_slot<LossAttrs>(sig, ATTRS);
+  add_arg_slot<ProfilingSettings>(sig, PROFILING);
+  return sig;
+}
+
+TaskInvocation
+    backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) {
+  TaskBinding b;
+  b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::NO});
+  b.bind(LABEL, TensorGuidSpec{label, IsGrad::NO});
+  b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::YES});
+  b.bind_arg(ATTRS, attrs);
+  b.bind_arg(PROFILING, profiling_settings());
 
-  /* if ((logit_domain != part_domain) || (label_domain != part_domain)) { */
   // TODO @lockshaw make sure this is still checked
-  /* fprintf(stderr, */
-  /*         "Encounter inconsistency in parallelizing loss computation"); */
-  /* assert(false); */
-  /* } */
-  return {LOSS_BWD_TASK_ID, binding};
+  return {task_id_t::LOSS_BWD_TASK_ID, b};
 }
 
-static void
-    loss_backward_task(Legion::Task const *task,
-                       std::vector<Legion::PhysicalRegion> const &regions,
-                       Legion::Context ctx,
-                       Legion::Runtime *runtime) {
-  TaskArgumentAccessor acc(task, regions, ctx, runtime);
-  auto attrs = acc.get_argument<LossAttrs>(LOSS_ATTRS);
-  auto profiling_settings =
-      acc.get_argument<ProfilingSettings>(PROFILING_SETTINGS);
-  auto batch_size = acc.get_argument<int>(BATCH_SIZE);
-  auto logit_grad = acc.get_tensor<Permissions::RW>(LOGIT_GRAD);
+static void backward_task_impl(TaskArgumentAccessor const &acc) {
+  auto attrs = acc.get_argument<LossAttrs>(ATTRS);
+  auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto logit_grad = acc.get_tensor_grad<Permissions::RW>(LOGIT);
   auto logit = acc.get_tensor<Permissions::RO>(LOGIT);
   auto label = acc.get_tensor<Permissions::RO>(LABEL);
+  int batch_size = label.shape.at(ff_dim_t{0});
 
   LossFunction loss_type = get_loss_function(attrs);
   float scale_factor = 1.0f / batch_size;
@@ -73,7 +61,7 @@ static void
 
   if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) {
     // assertion the outter-most dim is replica dim and replica degree is 1
-    auto scce_attrs = get<SparseCategoricalCrossEntropyLossAttrs>(attrs);
+    auto scce_attrs = attrs.get<SparseCategoricalCrossEntropyLossAttrs>();
     size_t ndim = logit.shape.num_dims();
     assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1);
     int num_samples = logit.shape.at(legion_dim_t(ndim - 2));
     int num_classes = logit.shape.get_volume() / num_samples;
     assert(logit_grad.shape == logit.shape);
 
     int k = 1;
     if (scce_attrs.replace_labels) {
       k = logit.shape.at(legion_dim_t(ndim - 1)) /
           label.shape.at(legion_dim_t(
               ndim - 1)); // TODO FIXME something seems wrong here, isn't the
                           // numerator guaranteed to be 1?
} - assert(label.shape.sub_shape(legion_dim_t(1), nullopt) == - logit.shape.sub_shape(legion_dim_t(1), nullopt)); + assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1), std::nullopt)); assert(k * label.shape.at(legion_dim_t(ndim - 1)) == logit.shape.at(legion_dim_t(ndim - 1))); assert(label.shape.at(legion_dim_t(0)) == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, - profiling_settings, + profiling, "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), get_int32_ptr(label), - logit.shape.get_volume(), + get_volume(logit.shape), get_volume(logit_grad.shape), num_samples, num_classes, @@ -115,7 +103,7 @@ static void switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, - profiling_settings, + profiling, "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -127,7 +115,7 @@ static void } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { profile(mean_squared_error_avg_loss_backward_kernel, - profiling_settings, + profiling, "[MeanSquaredErrorAvgLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -139,7 +127,7 @@ static void } case LossFunction::IDENTITY: { profile(identity_loss_backward_kernel, - profiling_settings, + profiling, "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -156,16 +144,8 @@ static void } } -template <> -void register_task() { - TaskSignature sig; - sig.add_arg_slot(LOSS_ATTRS); - sig.add_arg_slot(PROFILING_SETTINGS); - sig.add_slot(LOGIT, {SlotType::TENSOR, Permissions::RO}); - sig.add_slot(LABEL, {SlotType::TENSOR, Permissions::RO}); - sig.add_slot(LOGIT_GRAD, {SlotType::TENSOR, Permissions::RW}); - - register_task(LOSS_BWD_TASK_ID, "Loss Backward", sig, loss_backward_task); +TaskImplFunction get_loss_bwd_task_impl() { + return TaskImplFunction{GenericTaskImplFunction{backward_task_impl}}; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index eebef9039d..8ede2cb38b 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -13,7 +13,7 @@ * limitations under the License. 
 */
 
-#include "local-execution/ops/attention.h"
+#include "attention.h"
 #include "kernels/attention_kernels.h"
 #include "local-execution/op_task_signature.h"
 #include "op-attrs/ops/attention.h"
diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/local-execution/src/ops/attention.h
similarity index 100%
rename from lib/local-execution/include/local-execution/ops/attention.h
rename to lib/local-execution/src/ops/attention.h
diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc
new file mode 100644
index 0000000000..c64af5332e
--- /dev/null
+++ b/lib/local-execution/src/task_invocation.cc
@@ -0,0 +1,49 @@
+#include "local-execution/task_invocation.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
+  this->bind(slot_id_t{name}, tensor_guid_spec);
+}
+
+void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
+  this->tensor_bindings.insert(
+      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
+}
+
+void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
+  assert(!contains_key(this->arg_bindings, name));
+  this->arg_bindings.insert({name, arg_spec});
+}
+
+bool TaskBinding::operator==(TaskBinding const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool TaskBinding::operator!=(TaskBinding const &other) const {
+  return this->tie() != other.tie();
+}
+
+std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+           std::unordered_map<slot_id_t, TaskArgSpec> const &>
+    TaskBinding::tie() const {
+  return std::tie(this->tensor_bindings, this->arg_bindings);
+}
+
+std::unordered_map<SlotGradId, TensorGuidSpec> const &
+    TaskBinding::get_tensor_bindings() const {
+  return this->tensor_bindings;
+}
+
+std::unordered_map<slot_id_t, TaskArgSpec> const &
+    TaskBinding::get_arg_bindings() const {
+  return this->arg_bindings;
+}
+
+bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) {
+  // TODO: implement signature checking
+  return true;
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc
new file mode 100644
index 0000000000..3bba9e2c8a
--- /dev/null
+++ b/lib/local-execution/src/task_signature.cc
@@ -0,0 +1,25 @@
+#include "local-execution/task_signature.h"
+
+namespace FlexFlow {
+
+TaskSignature make_empty_task_signature() {
+  return TaskSignature(std::nullopt, {}, {});
+}
+
+void add_slot(TaskSignature &task_signature,
+              int name,
+              IsGrad is_grad,
+              SlotType slot_type) {
+  add_slot(task_signature, slot_id_t{name}, is_grad, slot_type);
+}
+
+void add_slot(TaskSignature &task_signature,
+              slot_id_t name,
+              IsGrad is_grad,
+              SlotType slot_type) {
+  TensorGuidSlotSpec tensor_guid_slot_spec =
+      TensorGuidSlotSpec{name, slot_type, is_grad};
+  task_signature.tensor_guid_slots.insert(tensor_guid_slot_spec);
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc
index ca428aad25..16b7870601 100644
--- a/lib/local-execution/src/task_signature_impl.cc
+++ b/lib/local-execution/src/task_signature_impl.cc
@@ -1,5 +1,5 @@
 #include "local-execution/task_signature_impl.h"
-#include "local-execution/ops/attention.h"
+#include "ops/attention.h"
 #include "ops/batch_matmul.h"
 #include "ops/batch_norm.h"
 #include "ops/cast.h"
diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc
index fa3b068425..2c3a6c1d63 100644
--- 
a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,7 +1,6 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/ops/attention.h" #include "local-execution/task_signature_impl.h" #include "pcg/computation_graph_builder.h" #include "utils/fmt/optional.h" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml new file mode 100644 index 0000000000..8a4f38839c --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml @@ -0,0 +1,22 @@ +namespace = "FlexFlow" +name = "LossAttrs" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", +] + +includes = [ + "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h", + "op-attrs/ops/other_loss_attrs.dtg.h" +] + +[[values]] +type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs" +key = "sparse_categorical_ce_loss_attrs" + +[[values]] +type = "::FlexFlow::OtherLossAttrs" +key = "other_loss_attrs" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml new file mode 100644 index 0000000000..b9cd13eabf --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "LossFunction" +features = [ + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[values]] +name = "CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "SPARSE_CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "MEAN_SQUARED_ERROR_AVG_REDUCE" + +[[values]] +name = "MEAN_SQUARED_ERROR_SUM_REDUCE" + +[[values]] +name = "IDENTITY" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h index 58d372d9e5..9fb0597197 100644 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h @@ -2,74 +2,16 @@ #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H #include "core.h" -#include "utils/exception.h" -#include "utils/visitable.h" -#include +#include "loss_attrs.dtg.h" +#include "loss_function.dtg.h" +#include "other_loss_attrs.dtg.h" +#include "sparse_categorical_ce_loss_attrs.dtg.h" namespace FlexFlow { -enum class LossFunction { - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR_AVG_REDUCE, - MEAN_SQUARED_ERROR_SUM_REDUCE, - IDENTITY -}; - -LossFunction parse_loss_function_name(std::string const &); - -struct SparseCategoricalCrossEntropyLossAttrs { - req replace_labels; // for aggregate_spec: More predictions than labels -}; -FF_VISITABLE_STRUCT(SparseCategoricalCrossEntropyLossAttrs, replace_labels); -CHECK_VALID_OP_ATTR(SparseCategoricalCrossEntropyLossAttrs); - -struct OtherLossAttrs { - req loss_type; -}; -FF_VISITABLE_STRUCT(OtherLossAttrs, loss_type); -CHECK_VALID_OP_ATTR(OtherLossAttrs); - -using LossAttrs = - std::variant; - -LossFunction get_loss_function(OtherLossAttrs const &); -LossFunction get_loss_function(SparseCategoricalCrossEntropyLossAttrs const &); LossFunction get_loss_function(LossAttrs const &); +LossFunction parse_loss_name(std::string const &raw_name); } // namespace FlexFlow -namespace fmt { - -template <> -struct formatter<::FlexFlow::LossFunction> : formatter { - template - auto format(::FlexFlow::LossFunction d, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - 
switch (d) {
-      case LossFunction::CATEGORICAL_CROSSENTROPY:
-        name = "CategoricalCrossEntropy";
-        break;
-      case LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY:
-        name = "SparseCategoricalCrossEntropy";
-        break;
-      case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE:
-        name = "MeanSquaredErrorAvgReduce";
-        break;
-      case LossFunction::MEAN_SQUARED_ERROR_SUM_REDUCE:
-        name = "MeanSquaredErrorSumReduce";
-        break;
-      case LossFunction::IDENTITY:
-        name = "Identity";
-        break;
-    }
-    return formatter<string_view>::format(name, ctx);
-  }
-};
-
-} // namespace fmt
-
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
new file mode 100644
index 0000000000..81055f5835
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
@@ -0,0 +1,18 @@
+namespace = "FlexFlow"
+name = "OtherLossAttrs"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+includes = [
+  "op-attrs/ops/loss_function.dtg.h"
+]
+
+[[fields]]
+name = "loss_type"
+type = "::FlexFlow::LossFunction"
diff --git a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
new file mode 100644
index 0000000000..21378a1154
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
@@ -0,0 +1,14 @@
+namespace = "FlexFlow"
+name = "SparseCategoricalCrossEntropyLossAttrs"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+[[fields]]
+name = "replace_labels"
+type = "bool"
diff --git a/lib/op-attrs/src/loss_functions.cc b/lib/op-attrs/src/loss_functions.cc
index 094e117d77..cae88be453 100644
--- a/lib/op-attrs/src/loss_functions.cc
+++ b/lib/op-attrs/src/loss_functions.cc
@@ -1,27 +1,18 @@
 #include "op-attrs/ops/loss_functions.h"
 #include "utils/containers/transform.h"
+#include "utils/exception.h"
+#include "utils/overload.h"
 #include 
 #include 
 
 namespace FlexFlow {
 
-LossFunction get_loss_type(OtherLossAttrs const &attrs) {
-  return attrs.loss_type;
-}
-LossFunction
-    get_loss_type(SparseCategoricalCrossEntropyLossAttrs const &attrs) {
-  return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY;
-}
-
-struct GetLossFunction {
-  template <typename T>
-  LossFunction operator()(T const &t) {
-    return get_loss_type(t);
-  }
-};
-
-LossFunction get_loss_type(LossAttrs const &attrs) {
-  return visit(GetLossFunction{}, attrs);
+LossFunction get_loss_function(LossAttrs const &attrs) {
+  return attrs.visit<LossFunction>(
+      overload{[&](SparseCategoricalCrossEntropyLossAttrs const &s) {
+                 return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY;
+               },
+               [&](OtherLossAttrs const &s) { return s.loss_type; }});
 }
 
 LossFunction parse_loss_name(std::string const &raw_name) {

From b56c046b3bc44586bae96b59476b6c384f922837 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar 
Date: Tue, 27 Aug 2024 04:58:33 -0700
Subject: [PATCH 03/91] Add cuda test for loss function

---
 lib/kernels/CMakeLists.txt                    |  2 +-
 lib/kernels/include/kernels/array_shape.h     | 10 ++-
 lib/kernels/src/array_shape.cc                | 36 +++++++-
 lib/kernels/src/cuda/cuda_helper.cu           |  6 ++
 lib/kernels/src/device.h                      |  1 +
 .../local-execution/local_slots_backing.h     |  4 -
 .../src/local_slots_backing.cc                |  8 --
 .../src/local_training_backing.cc             | 11 ++-
 lib/local-execution/src/loss_functions.cc     | 22 ++---
 lib/local-execution/src/ops/element_unary.cc  |  8 +-
 .../src/task_signature_impl.cc                |  4 +-
 .../test/src/test_loss_function.cc            | 88 +++++++++++++++++++
 12 files changed, 159 insertions(+), 41 deletions(-)
 create mode 100644 lib/local-execution/test/src/test_loss_function.cc

diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt
index 54fa3c9583..baac58f8e3 100644
--- a/lib/kernels/CMakeLists.txt
+++ b/lib/kernels/CMakeLists.txt
@@ -8,7 +8,7 @@ file(GLOB_RECURSE SRC
       LIST_DIRECTORIES False
       src/*.cc
       src/cuda/cuda_helper.cu
-      src/cuda/loss_functions_kernels.cu
+      src/cuda/loss_function_kernels.cu
       src/cuda/ops/*.cu
     )
 
diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 5de9fae7ad..c95c447574 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -42,9 +42,13 @@ struct ArrayShape {
   std::optional<std::size_t> at_maybe(legion_dim_t) const;
   std::optional<std::size_t> at_maybe(ff_dim_t) const;
 
-  ArrayShape
-      sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-                std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
+  ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const;
+
+  ArrayShape sub_shape(std::optional<ff_dim_t> start,
+                       std::optional<ff_dim_t> end) const;
+
+  ArrayShape sub_shape(std::optional<legion_dim_t> start,
+                       std::optional<legion_dim_t> end) const;
 
 public:
   LegionTensorDims dims;
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index d5e2f1167d..bf80c6b5c1 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -50,12 +50,42 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
-ArrayShape ArrayShape::sub_shape(
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+// ArrayShape ArrayShape::sub_shape(
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+//   NOT_IMPLEMENTED();
+// }
+
+ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
   NOT_IMPLEMENTED();
 }
 
+ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
+                                 std::optional<ff_dim_t> end) const {
+  std::vector<std::size_t> new_shape;
+  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
+  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});
+
+  while (start_idx < end_idx) {
+    new_shape.push_back(this->at(start_idx));
+    start_idx = ff_dim_t{start_idx.value + 1};
+  }
+  return ArrayShape{new_shape};
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
+                                 std::optional<legion_dim_t> end) const {
+  std::vector<std::size_t> new_shape;
+  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
+  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});
+
+  while (start_idx < end_idx) {
+    new_shape.push_back(this->at(start_idx));
+    start_idx = add_to_legion_dim(start_idx, 1);
+  }
+  return ArrayShape{new_shape};
+}
+
 std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
     return dims.at(index);
diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu
index 2ff02038f4..5a303ca15e 100644
--- a/lib/kernels/src/cuda/cuda_helper.cu
+++ b/lib/kernels/src/cuda/cuda_helper.cu
@@ -35,6 +35,12 @@ __global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
   }
 }
 
+__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) {
+  CUDA_KERNEL_LOOP(i, size) {
+    ptr[i] = (b - a) * ptr[i] + a;
+  }
+}
+
 __global__ void ones_kernel(float *ptr, coord_t size) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = 1.0f;
diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h
index ceff2f92ff..e32805fde3 100644
--- a/lib/kernels/src/device.h
+++ b/lib/kernels/src/device.h
@@ -71,6 +71,7 @@ inline int GET_BLOCKS(int const N) {
 }
 
 __global__ void scale_kernel(float *ptr, size_t size, float a, float b);
+__global__ void 
scale_kernel(float *ptr, unsigned long size, float a, float b); __global__ void ones_kernel(float *ptr, size_t size); diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 312a13cc01..1f35bdd304 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -20,9 +20,6 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void allocate_label_tensor(tensor_guid_t const &, - ComputationGraph const &, - Allocator &); void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); @@ -40,7 +37,6 @@ struct LocalSlotsBacking { GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &, IsGrad) const; -private: bool is_tensor_allocated(tensor_guid_t const &) const; bool is_gradient_tensor_allocated(tensor_guid_t const &) const; diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 967f8d9ba3..787c7dda86 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -15,14 +15,6 @@ void LocalSlotsBacking::add_per_device_op_state( this->per_device_op_states.insert({op_guid, device_state}); } -void LocalSlotsBacking::allocate_label_tensor(tensor_guid_t const &label_tensor, - ComputationGraph const &cg, - Allocator &allocator) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(get_tensor_attrs(cg, label_tensor).shape); - this->tensor_mapping.insert({label_tensor, tensor_backing}); -} - void LocalSlotsBacking::allocate_outgoing_tensors( layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index f54d0ddaad..98bfe7683e 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,6 +1,8 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" #include "local-execution/task_signature_impl.h" +#include "utils/containers/contains.h" +#include "utils/containers/contains_key.h" #include "utils/containers/reversed.h" #include "utils/exception.h" @@ -30,10 +32,11 @@ LocalTrainingBacking::LocalTrainingBacking( } if (this->training_instance.has_value()) { - this->local_slots_backing.allocate_label_tensor( - this->training_instance.value().label_tensor, - computation_graph, - this->allocator); + // label and logit tensor should be allocated + assert(this->local_slots_backing.is_tensor_allocated( + this->training_instance.value().label_tensor)); + assert(this->local_slots_backing.is_tensor_allocated( + this->training_instance.value().logit_tensor)); } } diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 6b23d5da51..771d175a7d 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -50,7 +50,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_tensor(LABEL); - int batch_size = label.shape.at(ff_dim_t{0}); + int batch_size = logit.shape.at(legion_dim_t{1}); + // assuming logit shape is [parallel dim(?), batch dim, num classes] 
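+  // worked example under that assumption: for an ff-ordered logit shape
+  // [replica, batch, classes], legion dims index from the innermost
+  // dimension, so legion_dim_t{0} is the class dim and legion_dim_t{1}
+  // is the batch dim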
 LossFunction loss_type = get_loss_function(attrs);
   float scale_factor = 1.0f / batch_size;
@@ -60,19 +61,18 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   }
 
   if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) {
-    // assertion the outter-most dim is replica dim and replica degree is 1
+    // label shape is [parallel dim(?), batch dim, 1]
     auto scce_attrs = attrs.get<SparseCategoricalCrossEntropyLossAttrs>();
     size_t ndim = logit.shape.num_dims();
-    assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1);
-    int num_samples = logit.shape.at(legion_dim_t(ndim - 2));
-    int num_classes = logit.shape.get_volume() / num_samples;
+    int num_classes = logit.shape.at(legion_dim_t{0});
     assert(logit_grad.shape == logit.shape);
 
     int k = 1;
     if (scce_attrs.replace_labels) {
       k = logit.shape.at(legion_dim_t(ndim - 1)) /
           label.shape.at(legion_dim_t(
-              ndim - 1)); // TODO FIXME something seems wrong here, isn't the
-                          // numerator guaranteed to be 1?
+              ndim - 1)); // TODO FIXME something seems wrong here, isn't the
+                          // numerator guaranteed to be 1? <--- this is not the
+                          // case because of the potential parallel dim
     }
     assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) ==
            logit.shape.sub_shape(legion_dim_t(1), std::nullopt));
     assert(k * label.shape.at(legion_dim_t(ndim - 1)) ==
            logit.shape.at(legion_dim_t(ndim - 1)));
     assert(label.shape.at(legion_dim_t(0)) == 1);
 
     profile(sparse_categorical_crossentropy_loss_backward_kernel,
             profiling,
             "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n",
             get_float_ptr(logit_grad),
             get_float_ptr(logit),
-            get_int32_ptr(label),
+            reinterpret_cast<int const *>(get_float_ptr(label)),
             get_volume(logit.shape),
             get_volume(logit_grad.shape),
-            num_samples,
+            batch_size,
             num_classes,
             k,
             scale_factor);
   } else {
     assert(logit.shape == label.shape);
     assert(logit_grad.shape == logit.shape);
-    // assertion the outter-most dim is replica dim and replica degree is 1
-    size_t ndim = logit.shape.num_dims();
-    assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1);
-    int num_samples = label.shape.at(legion_dim_t(ndim - 1));
-    int num_channels = logit.shape.get_volume() / num_samples;
+    int num_channels = logit.shape.at(legion_dim_t{0});
     switch (loss_type) {
       case LossFunction::CATEGORICAL_CROSSENTROPY: {
         profile(categorical_crossentropy_loss_backward_kernel,
                 profiling,
diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc
index a52ebb8089..502afb5f9f 100644
--- a/lib/local-execution/src/ops/element_unary.cc
+++ b/lib/local-execution/src/ops/element_unary.cc
@@ -34,7 +34,9 @@ OpTaskInvocation forward(ElementUnaryAttrs const &attrs) {
 
   b.bind(INPUT, input_tensor(0));
   b.bind(OUTPUT, output_tensor(0));
+
   b.bind_arg(ATTRS, attrs);
+  b.bind_arg(HANDLE, ff_handle());
   b.bind_arg(PROFILING, profiling_settings());
   b.bind_arg(PER_DEVICE_STATE, per_device_op_state());
 
@@ -51,8 +53,8 @@ OpTaskInvocation backward(ElementUnaryAttrs const &attrs) {
 
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
-  auto const &attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
+
   ParallelTensorShape input_shape =
       acc.get_argument<ParallelTensorShape>(INPUT_SHAPE);
 
@@ -68,7 +70,7 @@ static DeviceSpecificDeviceStates
 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor<Permissions::RO>(INPUT);
   auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
-  auto const &attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
+  auto attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
 
   auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
 
diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc
index 16b7870601..3072b9a8bd 100644
--- a/lib/local-execution/src/task_signature_impl.cc
+++ b/lib/local-execution/src/task_signature_impl.cc
b/lib/local-execution/src/task_signature_impl.cc @@ -50,8 +50,8 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { return TaskSignatureAndImpl{get_element_unary_fwd_task_impl(), get_element_unary_fwd_signature()}; case task_id_t::ELEMENTUNARY_BWD_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_bwd_task_impl(), - get_element_binary_bwd_signature()}; + return TaskSignatureAndImpl{get_element_unary_bwd_task_impl(), + get_element_unary_bwd_signature()}; case task_id_t::CONV2D_INIT_TASK_ID: return TaskSignatureAndImpl{get_conv_2d_init_task_impl(), get_conv_2d_init_signature()}; diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc new file mode 100644 index 0000000000..73ab02646e --- /dev/null +++ b/lib/local-execution/test/src/test_loss_function.cc @@ -0,0 +1,88 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/managed_ff_stream.h" +#include "pcg/computation_graph_builder.h" +#include "test_utils.h" +#include "local-execution/local_training_backing.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Loss Function Local Execution") { + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::NO, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0} + }; + + // construct graph + ComputationGraphBuilder cg_builder; + + size_t batch_size = 10; + size_t data_dim = 100; + TensorShape input_shape = TensorShape{TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + tensor_guid_t input_tensor = cg_builder.create_tensor(input_shape, CreateGrad::YES); + + float scalar = 4.0; + tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar); + + // allocate memory + Allocator allocator = create_local_cuda_memory_allocator(); + TensorBackingMap tensor_backing_map; + GenericTensorAccessorW input_backing = allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({input_tensor, input_backing}); + + SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { + TensorShape label_shape = TensorShape{TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; + tensor_guid_t label_tensor = cg_builder.create_tensor(label_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); + tensor_backing_map.insert({label_tensor, label_backing}); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + SUBCASE("OtherAttrs") { + tensor_guid_t label_tensor = cg_builder.create_tensor(input_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({label_tensor, label_backing}); + + SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, 
cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + SUBCASE("LossFunction::IDENTITY") { + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + } + } +} + +} // namespace FlexFlow From f75a3d4c1cc85ae60d6254ddcabbb40b6f2338ad Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 05:00:16 -0700 Subject: [PATCH 04/91] Format --- lib/kernels/src/array_shape.cc | 6 -- .../test/src/test_loss_function.cc | 89 +++++++++++++------ 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index bf80c6b5c1..69f04d6d34 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -50,12 +50,6 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } -// ArrayShape ArrayShape::sub_shape( -// std::optional> start, -// std::optional> end) const { -// NOT_IMPLEMENTED(); -// } - ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const { NOT_IMPLEMENTED(); } diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc index 73ab02646e..9e60c1b979 100644 --- a/lib/local-execution/test/src/test_loss_function.cc +++ b/lib/local-execution/test/src/test_loss_function.cc @@ -1,10 +1,10 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" -#include "kernels/managed_per_device_ff_handle.h" #include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/local_training_backing.h" #include "pcg/computation_graph_builder.h" #include "test_utils.h" -#include "local-execution/local_training_backing.h" namespace FlexFlow { @@ -14,73 +14,106 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedPerDeviceFFHandle managed_handle{}; RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::NO, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0} - }; + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::NO, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; // construct graph ComputationGraphBuilder cg_builder; size_t batch_size = 10; size_t data_dim = 100; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - tensor_guid_t input_tensor = cg_builder.create_tensor(input_shape, CreateGrad::YES); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, 
DataType::FLOAT}; + tensor_guid_t input_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::YES); float scalar = 4.0; - tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar); + tensor_guid_t logit_tensor = + cg_builder.scalar_multiply(input_tensor, scalar); // allocate memory Allocator allocator = create_local_cuda_memory_allocator(); TensorBackingMap tensor_backing_map; - GenericTensorAccessorW input_backing = allocator.allocate_tensor(input_shape); + GenericTensorAccessorW input_backing = + allocator.allocate_tensor(input_shape); tensor_backing_map.insert({input_tensor, input_backing}); SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { - TensorShape label_shape = TensorShape{TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; - tensor_guid_t label_tensor = cg_builder.create_tensor(label_shape, CreateGrad::NO); - GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); + TensorShape label_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; + tensor_guid_t label_tensor = + cg_builder.create_tensor(label_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = + allocator.allocate_tensor(label_shape); tensor_backing_map.insert({label_tensor, label_backing}); - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{ + SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } SUBCASE("OtherAttrs") { - tensor_guid_t label_tensor = cg_builder.create_tensor(input_shape, CreateGrad::NO); - GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); + tensor_guid_t label_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = + allocator.allocate_tensor(input_shape); tensor_backing_map.insert({label_tensor, label_backing}); SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, 
cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{ + OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } SUBCASE("LossFunction::IDENTITY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } - } } } From f74711fb71685ef95c10770646e39fdf3acd27a0 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 12:30:16 -0700 Subject: [PATCH 05/91] Refactor and build optimizer kernels, op --- lib/kernels/CMakeLists.txt | 1 + lib/kernels/include/kernels/array_shape.h | 3 + .../include/kernels/optimizer_kernels.h | 3 + lib/kernels/src/array_shape.cc | 8 + lib/kernels/src/cuda/optimizer_kernels.cu | 167 +++++++------- .../include/local-execution/loss_functions.h | 4 +- .../include/local-execution/optimizer.h | 22 ++ .../src/local_training_backing.cc | 9 +- lib/local-execution/src/optimizer.cc | 205 ++++++++++++++++++ lib/pcg/include/pcg/optimizer_attrs.h | 14 -- .../include/pcg/optimizer_attrs.variant.toml | 23 ++ .../adam_optimizer_attrs.struct.toml | 4 + 12 files changed, 370 insertions(+), 93 deletions(-) create mode 100644 lib/local-execution/include/local-execution/optimizer.h create mode 100644 lib/local-execution/src/optimizer.cc delete mode 100644 lib/pcg/include/pcg/optimizer_attrs.h create mode 100644 lib/pcg/include/pcg/optimizer_attrs.variant.toml diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index baac58f8e3..5a6a0d1357 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -9,6 +9,7 @@ file(GLOB_RECURSE SRC src/*.cc src/cuda/cuda_helper.cu src/cuda/loss_function_kernels.cu + src/cuda/optimizer_kernels.cu src/cuda/ops/*.cu ) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index c95c447574..6b0b57b57f 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -50,6 +50,9 @@ struct ArrayShape { ArrayShape sub_shape(std::optional start, std::optional end) const; + bool operator==(ArrayShape const &) const; + bool operator!=(ArrayShape const &) const; + public: LegionTensorDims dims; }; diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index fcbf9454f8..ed7c2778dd 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -34,6 +34,8 @@ void adam_ps_update_task_gpu(ffStream_t, float beta2, float weight_decay, float epsilon, + size_t size, + int num_replicas, 
float const *weight_grad_ptr, float *adam_m_ptr, float *adam_v_ptr, @@ -45,6 +47,7 @@ void adam_nccl_update_task_gpu(ffStream_t, float beta2, float weight_decay, float epsilon, + size_t size, PerDeviceFFHandle const &, float const *weight_grad_ptr, float *adam_m_ptr, diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 69f04d6d34..ddfa3964e3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -101,6 +101,14 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { dtype}; } +bool ArrayShape::operator==(ArrayShape const & other) const { + return this->dims == other.dims; +} + +bool ArrayShape::operator!=(ArrayShape const & other) const { + return this->dims != other.dims; +} + std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " <<>>( - (float *)w_grad_ptr, src, size, 1.0f); + (float *)weight_grad_ptr, src, size, 1.0f); } // checkCUDA(cudaDeviceSynchronize()); // Step 2: SGD update sgd_update<<>>( size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { +void sgd_nccl_update_task_gpu(cudaStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const & handle, + float const *weight_grad_ptr, + size_t size, + float *weight_ptr, + float *sgd_v_ptr) { // Use NCCL to sync gradients // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, + checkNCCL(ncclAllReduce(weight_grad_ptr, + (float *)weight_grad_ptr, size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, + ncclDataType_t::ncclFloat, + ncclRedOp_t::ncclSum, + handle.ncclComm, stream)); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -94,13 +101,13 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // Step 2: SGD update sgd_update<<>>( size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #endif @@ -145,20 +152,24 @@ __global__ void adam_update(int count, } } -__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - cudaStream_t stream; +void adam_ps_update_task_gpu(cudaStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + size_t size, + int num_replicas, + float const *weight_grad_ptr, + float *adam_m_ptr, + float *adam_v_ptr, + float *weight_ptr) { checkCUDA(get_legion_stream(&stream)); // Step 1: Gather gradients in the first replica for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; + float const *src = weight_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + size, 1.0f, src, (float *)weight_grad_ptr); } // checkCUDA(cudaDeviceSynchronize()); // 
fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -166,50 +177,54 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, // Step 2: Adam update adam_update<<>>( size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + weight_grad_ptr, + adam_m_ptr, + adam_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { +void adam_nccl_update_task_gpu(cudaStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + size_t size, + PerDeviceFFHandle const & handle, + float const *weight_grad_ptr, + float *adam_m_ptr, + float *adam_v_ptr, + float *weight_ptr) { // Use NCCL to sync gradients - cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, + checkNCCL(ncclAllReduce(weight_grad_ptr, + (float *)weight_grad_ptr, size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, + ncclDataType_t::ncclFloat, + ncclRedOp_t::ncclSum, + handle.ncclComm, stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update adam_update<<>>( size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + weight_grad_ptr, + adam_m_ptr, + adam_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #endif diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index e5e81b60a7..58405536d8 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -13,8 +13,8 @@ * limitations under the License. 
*/ -#ifndef _FLEXFLOW_LOSS_FUNCTIONS_H_ -#define _FLEXFLOW_LOSS_FUNCTIONS_H_ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.h" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h new file mode 100644 index 0000000000..4702352568 --- /dev/null +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ + +#include "local-execution/task_impl_function.dtg.h" +#include "local-execution/task_invocation.h" +#include "local-execution/task_signature.h" +#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "pcg/optimizers/adam_optimizer_attrs.dtg.h" + +namespace FlexFlow { + +TaskSignature get_sgd_update_signature(); +TaskInvocation sgd_update(SGDOptimizerAttrs const &); +TaskImplFunction get_sgd_update_task_impl(); + +TaskSignature get_adam_update_signature(); +TaskInvocation adam_update(SGDOptimizerAttrs const &); +TaskImplFunction get_adam_update_task_impl(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 98bfe7683e..c8f5f279d2 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -4,6 +4,7 @@ #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/reversed.h" +#include "utils/containers/get_only.h" #include "utils/exception.h" namespace FlexFlow { @@ -136,7 +137,13 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { } void LocalTrainingBacking::execute_update() { - NOT_IMPLEMENTED(); + for (layer_guid_t const &node: topological_ordering(this->computation_graph)) { + LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); + if (layer_attrs.attrs.has()) { + tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); + // TODO: handle momentum vectors separately? handle different updates? 
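+      // a sketch of the intended dispatch, using helper names that land in
+      // a later patch of this series:
+      //   TaskInvocation invocation =
+      //       get_update_invocation(attrs, weight_tensor, buffer_tensors);
+      //   TaskArgumentAccessor accessor =
+      //       this->get_task_arg_accessor(invocation);
+      //   get_update_task_impl(attrs).get().function_ptr(accessor);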
+ } + } } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc new file mode 100644 index 0000000000..2f45802978 --- /dev/null +++ b/lib/local-execution/src/optimizer.cc @@ -0,0 +1,205 @@ +#include "kernels/optimizer_kernels.h" +#include "local-execution/optimizer.h" +#include "local-execution/profiling.h" + +namespace FlexFlow { + +enum Slots { + ATTRS, + WEIGHT, + SGD_V, + PROFILING, + ADAM_M, + ADAM_V, + HANDLE +}; + +TaskSignature get_sgd_update_signature() { + TaskSignature sig = make_empty_task_signature(); + add_slot(sig, WEIGHT, IsGrad::YES); + add_slot(sig, WEIGHT, IsGrad::NO); + add_slot(sig, SGD_V, IsGrad::YES); + add_arg_slot(sig, ATTRS); + add_arg_slot(sig, PROFILING); + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + add_unchecked_arg_slot(sig, HANDLE); + } + return sig; +} + +TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs, + tensor_guid_t const & weight, + tensor_guid_t const & sgd_v) { + TaskBinding b; + b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); + if (attrs.momentum > 0.0f) { + b.bind(SGD_V, TensorGuidSpec{sgd_v, IsGrad::YES}); + } + b.bind_arg(ATTRS, attrs); + b.bind_arg(PROFILING, profiling_settings()); + + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + b.bind_arg(HANDLE, ff_handle()); + return {task_id_t::SGD_UPD_NCCL_TASK_ID, b}; + } + return {task_id_t::SGD_UPD_PS_TASK_ID, b}; +} + +static void sgd_update_task_impl(TaskArgumentAccessor const & acc) { + auto attrs = acc.get_argument(ATTRS); + auto weight_grad = acc.get_tensor_grad(WEIGHT); + auto weight = acc.get_tensor(WEIGHT); + auto profiling = acc.get_argument(PROFILING); + + assert (weight.shape == weight_grad.shape); + size_t size = weight_grad.shape.get_volume(); + + assert (weight_grad.shape.get_volume() & weight.shape.get_volume() == 0); + size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume(); + + float *sgd_v_ptr; + if (attrs.momentum > 0.0f) { + auto sgd_v = acc.get_tensor(SGD_V); + assert (sgd_v.shape == weight.shape); + sgd_v_ptr = sgd_v.get_float_ptr(); + } + + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + auto handle = acc.get_argument(HANDLE); + profile(sgd_nccl_update_task_gpu, + profiling, + "[SGD NCCL] update_time = %.2lfms\n", + attrs.lr, + attrs.momentum, + attrs.nesterov, + attrs.weight_decay, + handle, + weight_grad.get_float_ptr(), + size, + weight.get_float_ptr(), + sgd_v_ptr); + + } else { + profile(sgd_ps_update_task_gpu, + profiling, + "[SGD PS] update_time = %.2lfms\n", + attrs.lr, + attrs.momentum, + attrs.nesterov, + attrs.weight_decay, + weight_grad.get_float_ptr(), + size, + num_replicas, + weight.get_float_ptr(), + sgd_v_ptr); + } +} + +TaskImplFunction get_sgd_update_task_impl() { + return TaskImplFunction{GenericTaskImplFunction{sgd_update_task_impl}}; +} + +TaskSignature get_adam_update_signature() { + TaskSignature sig = make_empty_task_signature(); + add_slot(sig, WEIGHT, IsGrad::YES); + add_slot(sig, WEIGHT, IsGrad::NO); + add_slot(sig, ADAM_V, IsGrad::YES); + add_slot(sig, ADAM_M, IsGrad::YES); + add_arg_slot(sig, ATTRS); + add_arg_slot(sig, PROFILING); + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + add_unchecked_arg_slot(sig, HANDLE); + } + return sig; +} + +TaskInvocation adam_update(AdamOptimizerAttrs const & attrs, + tensor_guid_t const & weight, + tensor_guid_t const & adam_v, + tensor_guid_t const & adam_m) { + TaskBinding b; + b.bind(WEIGHT, TensorGuidSpec{weight, 
IsGrad::YES});
+  b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
+  b.bind(ADAM_M, TensorGuidSpec{adam_m, IsGrad::YES});
+  b.bind(ADAM_V, TensorGuidSpec{adam_v, IsGrad::YES});
+  b.bind_arg(ATTRS, attrs);
+  b.bind_arg(PROFILING, profiling_settings());
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    b.bind_arg(HANDLE, ff_handle());
+    return {task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+  }
+  return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
+}
+
+static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
+  auto attrs = acc.get_argument(ATTRS);
+  auto weight_grad = acc.get_tensor_grad(WEIGHT);
+  auto weight = acc.get_tensor(WEIGHT);
+  auto v_tensor = acc.get_tensor(ADAM_V);
+  auto m_tensor = acc.get_tensor(ADAM_M);
+
+  auto profiling = acc.get_argument(PROFILING);
+
+  assert (weight.shape == weight_grad.shape);
+  size_t size = weight_grad.shape.get_volume();
+
+  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    auto handle = acc.get_argument(HANDLE);
+    profile(adam_nccl_update_task_gpu,
+            profiling,
+            "[Adam NCCL] update_time = %.2lfms\n",
+            attrs.alpha_t,
+            attrs.beta1,
+            attrs.beta2,
+            attrs.weight_decay,
+            attrs.epsilon,
+            size,
+            handle,
+            weight_grad.get_float_ptr(),
+            m_tensor.get_float_ptr(),
+            v_tensor.get_float_ptr(),
+            weight.get_float_ptr());
+  } else {
+    profile(adam_ps_update_task_gpu,
+            profiling,
+            "[Adam PS] update_time = %.2lfms\n",
+            attrs.alpha_t,
+            attrs.beta1,
+            attrs.beta2,
+            attrs.weight_decay,
+            attrs.epsilon,
+            size,
+            num_replicas,
+            weight_grad.get_float_ptr(),
+            m_tensor.get_float_ptr(),
+            v_tensor.get_float_ptr(),
+            weight.get_float_ptr());
+  }
+}
+
+AdamOptimizerAttrs next(AdamOptimizerAttrs const & old) {
+  double new_beta1_t = old.beta_t * old.beta1;
+  double new_beta2_t = old.beta2_t * old.beta2;
+  double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
+  return AdamOptimizerAttrs{
+      old.alpha,
+      old.beta1,
+      old.beta2,
+      old.weight_decay,
+      new_alpha_t,
+      new_beta1_t,
+      new_beta2_t,
+      old.epsilon
+  };
+}
+
+TaskImplFunction get_adam_update_task_impl() {
+  return TaskImplFunction{GenericTaskImplFunction{adam_update_task_impl}};
+}
+
+}
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
deleted file mode 100644
index 4bac74b999..0000000000
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _FLEXFLOW_PCG_INCLUDE_PCG_OPTIMIZER_H
-#define _FLEXFLOW_PCG_INCLUDE_PCG_OPTIMIZER_H
-
-#include "pcg/optimizers/adam_optimizer_attrs.h"
-#include "pcg/optimizers/sgd_optimizer_attrs.h"
-#include "utils/variant.h"
-
-namespace FlexFlow {
-
-using OptimizerAttrs = std::variant;
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/pcg/include/pcg/optimizer_attrs.variant.toml b/lib/pcg/include/pcg/optimizer_attrs.variant.toml
new file mode 100644
index 0000000000..585c150700
--- /dev/null
+++ b/lib/pcg/include/pcg/optimizer_attrs.variant.toml
@@ -0,0 +1,23 @@
+namespace = "FlexFlow"
+name = "OptimizerAttrs"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  "fmt",
+  "rapidcheck",
+]
+
+includes = [
+  "pcg/optimizers/sgd_optimizer_attrs.dtg.h",
+  "pcg/optimizers/adam_optimizer_attrs.dtg.h",
+]
+
+[[values]]
+type = "::FlexFlow::SGDOptimizerAttrs"
+key = "sgd_optimizer"
+
+[[values]]
+type = "::FlexFlow::AdamOptimizerAttrs"
+key = "adam_optimizer"
diff --git a/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml 
b/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml index fd3e83cc4a..c25baa6c89 100644 --- a/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml +++ b/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml @@ -36,3 +36,7 @@ type = "double" [[fields]] name = "beta2_t" type = "double" + +[[fields]] +name = "epsilon" +type = "double" From 40c62526336ffbbee069988126047dcdad64a1ce Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 13:40:10 -0700 Subject: [PATCH 06/91] Finish optimizer local backing --- lib/kernels/src/array_shape.cc | 4 +- lib/kernels/src/cuda/optimizer_kernels.cu | 40 +++-- .../local-execution/local_slots_backing.h | 11 +- .../local-execution/local_training_backing.h | 3 +- .../local-execution/model_training_instance.h | 13 ++ .../model_training_instance.struct.toml | 5 + .../include/local-execution/optimizer.h | 18 ++- .../src/local_cost_estimator.cc | 3 +- .../src/local_slots_backing.cc | 21 +++ .../src/local_training_backing.cc | 49 ++++++- .../src/model_training_instance.cc | 26 ++++ lib/local-execution/src/optimizer.cc | 137 ++++++++++-------- .../test/src/test_loss_function.cc | 47 +++--- lib/pcg/include/pcg/computation_graph.h | 4 + lib/pcg/include/pcg/optimizer_attrs.h | 13 ++ lib/pcg/src/pcg/computation_graph.cc | 13 ++ lib/pcg/src/pcg/optimizer_attrs.cc | 14 ++ 17 files changed, 300 insertions(+), 121 deletions(-) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.h create mode 100644 lib/local-execution/src/model_training_instance.cc create mode 100644 lib/pcg/include/pcg/optimizer_attrs.h create mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index ddfa3964e3..054e16e90a 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -101,11 +101,11 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { dtype}; } -bool ArrayShape::operator==(ArrayShape const & other) const { +bool ArrayShape::operator==(ArrayShape const &other) const { return this->dims == other.dims; } -bool ArrayShape::operator!=(ArrayShape const & other) const { +bool ArrayShape::operator!=(ArrayShape const &other) const { return this->dims != other.dims; } diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 7d1d720ba0..2eaf30b21f 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -14,8 +14,8 @@ */ #include "device.h" -#include "kernels/optimizer_kernels.h" #include "kernels/nccl.h" +#include "kernels/optimizer_kernels.h" namespace FlexFlow { @@ -62,15 +62,14 @@ void sgd_ps_update_task_gpu(cudaStream_t stream, } // checkCUDA(cudaDeviceSynchronize()); // Step 2: SGD update - sgd_update<<>>( - size, - lr, - weight_decay, - momentum, - nesterov, - weight_grad_ptr, - sgd_v_ptr, - weight_ptr); + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } @@ -80,7 +79,7 @@ void sgd_nccl_update_task_gpu(cudaStream_t stream, float momentum, bool nesterov, float weight_decay, - PerDeviceFFHandle const & handle, + PerDeviceFFHandle const &handle, float const *weight_grad_ptr, size_t size, float *weight_ptr, @@ -99,15 +98,14 @@ void sgd_nccl_update_task_gpu(cudaStream_t stream, // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); // Step 2: SGD update - sgd_update<<>>( - size, - lr, - 
weight_decay, - momentum, - nesterov, - weight_grad_ptr, - sgd_v_ptr, - weight_ptr); + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #endif @@ -197,7 +195,7 @@ void adam_nccl_update_task_gpu(cudaStream_t stream, float weight_decay, float epsilon, size_t size, - PerDeviceFFHandle const & handle, + PerDeviceFFHandle const &handle, float const *weight_grad_ptr, float *adam_m_ptr, float *adam_v_ptr, diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 1f35bdd304..439113c873 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -1,6 +1,6 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_SLOT_REGISTRY_H -#define _FLEXFLOW_LOCAL_EXECUTION_SLOT_REGISTRY_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H #include "kernels/accessor.h" #include "local-execution/local_task_argument_accessor.h" @@ -23,6 +23,11 @@ struct LocalSlotsBacking { void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); + void allocate_optimizer_tensors(layer_guid_t const &weight_layer, + tensor_guid_t const &, + ComputationGraph const &, + Allocator &, + TaskSignature const &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; @@ -48,6 +53,8 @@ struct LocalSlotsBacking { input_tensor_slots; std::unordered_map> output_tensor_slots; + std::unordered_map> + weight_optimizer_tensor_guids; // arguments std::unordered_map diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 55983086c2..d2586038f0 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -4,7 +4,6 @@ #include "local-execution/local_slots_backing.h" #include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" -#include "op-attrs/ops/loss_functions.h" namespace FlexFlow { @@ -16,7 +15,7 @@ struct LocalTrainingBacking { ComputationGraph const &, TensorBackingMap const &, RuntimeArgConfig const &, - std::optional const &); + std::optional &); void execute_init(); PerLayerElapsedTime execute_forward(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h new file mode 100644 index 0000000000..7ea027a636 --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -0,0 +1,13 @@ + +#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H + +#include "local-execution/model_training_instance.dtg.h" + +namespace FlexFlow { + +ModelTrainingInstance next(ModelTrainingInstance const & old); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index ea7e8d24ab..e3ff397e39 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ 
b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -11,6 +11,7 @@ includes = [ "utils/optional.h", "op-attrs/ops/loss_attrs.dtg.h", "pcg/tensor_guid_t.dtg.h", + "pcg/optimizer_attrs.dtg.h", ] [[fields]] @@ -24,3 +25,7 @@ type = "::FlexFlow::tensor_guid_t" [[fields]] name = "logit_tensor" type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 4702352568..53dcad63de 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -4,17 +4,29 @@ #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature.h" -#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" +#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" namespace FlexFlow { +TaskSignature get_update_signature(OptimizerAttrs const &); +TaskInvocation get_update_invocation(OptimizerAttrs const &, + tensor_guid_t const &weight, + std::vector const &); +TaskImplFunction get_update_task_impl(OptimizerAttrs const &); + TaskSignature get_sgd_update_signature(); -TaskInvocation sgd_update(SGDOptimizerAttrs const &); +TaskInvocation sgd_update(SGDOptimizerAttrs const &, + tensor_guid_t const &weight, + tensor_guid_t const &); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); -TaskInvocation adam_update(SGDOptimizerAttrs const &); +TaskInvocation adam_update(AdamOptimizerAttrs const &, + tensor_guid_t const &weight, + tensor_guid_t const &, + tensor_guid_t const &); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 1ca422d8e1..a39d55adff 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -72,11 +72,12 @@ CostDetails LocalCostEstimator::estimate_cost( get_vector_piece_attrs(weights), get_vector_piece_attrs(outputs)); + std::optional model_training_instance = std::nullopt; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, this->runtime_arg_config, - std::nullopt); + model_training_instance); local_backing.execute_init(); PerLayerElapsedTime fwd = local_backing.execute_forward(); diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 787c7dda86..5059f29abd 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -47,6 +47,27 @@ void LocalSlotsBacking::allocate_outgoing_tensors( this->output_tensor_slots.insert({layer_guid, outgoing_tensors}); } +void LocalSlotsBacking::allocate_optimizer_tensors( + layer_guid_t const &weight_layer, + tensor_guid_t const &weight, + ComputationGraph const &cg, + Allocator &allocator, + TaskSignature const &sig) { + GenericTensorAccessorW weight_backing = + get_tensor_backing(weight, IsGrad::NO); + int num_buffer_tensors = + sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) + std::vector buffer_tensors = + get_new_tensor_guids_for_layer_without_graph_insertion( + cg, weight_layer, num_buffer_tensors); + for (auto const &tensor_guid : 
buffer_tensors) { + GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( + get_tensor_shape(weight_backing.shape, weight_backing.data_type)); + this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing}); + } + this->weight_optimizer_tensor_guids.insert({weight, buffer_tensors}); +} + bool LocalSlotsBacking::is_tensor_allocated( tensor_guid_t const &tensor_id) const { return contains_key(this->tensor_mapping, tensor_id); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index c8f5f279d2..eb49f16df1 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,10 +1,12 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" +#include "local-execution/model_training_instance.h" +#include "local-execution/optimizer.h" #include "local-execution/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" -#include "utils/containers/reversed.h" #include "utils/containers/get_only.h" +#include "utils/containers/reversed.h" #include "utils/exception.h" namespace FlexFlow { @@ -14,22 +16,33 @@ LocalTrainingBacking::LocalTrainingBacking( ComputationGraph const &computation_graph, TensorBackingMap const &tensor_backing_mapping, RuntimeArgConfig const &runtime_arg_config, - std::optional const &training_instance) + std::optional &training_instance) : allocator(allocator), computation_graph(computation_graph), local_slots_backing(tensor_backing_mapping, runtime_arg_config), task_registry(empty_task_registry()), training_instance(training_instance) { - for (layer_guid_t const &node : topological_ordering(computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(computation_graph, node).attrs; + get_layer_attrs(this->computation_graph, node).attrs; // allocate outgoing tensors this->local_slots_backing.allocate_outgoing_tensors( - node, computation_graph, this->allocator); + node, this->computation_graph, this->allocator); // register tasks register_tasks_for_layer(this->task_registry, node, attrs); + + // allocate optimizer buffers + if (attrs.has() && this->training_instance.has_value()) { + OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; + TaskSignature sig = get_update_signature(attrs); + tensor_guid_t weight_tensor = + get_only(get_outgoing_tensors(this->computation_graph, node)); + this->local_slots_backing.allocate_optimizer_tensors( + node, weight_tensor, this->computation_graph, this->allocator, sig); + } } if (this->training_instance.has_value()) { @@ -137,13 +150,33 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { } void LocalTrainingBacking::execute_update() { - for (layer_guid_t const &node: topological_ordering(this->computation_graph)) { + assert(this->training_instance.has_value()); + OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; + + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); if (layer_attrs.attrs.has()) { - tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - // TODO: handle momentum vectors separately? handle different updates? 
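+      // update step for each weight layer: look up the weight tensor and its
+      // optimizer buffers (an SGD momentum tensor, or Adam m and v tensors),
+      // build the update TaskInvocation, then run it through the generic
+      // task impl function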
+ // get tensors + tensor_guid_t weight_tensor = + get_only(get_outgoing_tensors(this->computation_graph, node)); + std::vector buffer_tensors = + this->local_slots_backing.weight_optimizer_tensor_guids.at( + weight_tensor); + + // get invocation + TaskInvocation invocation = + get_update_invocation(attrs, weight_tensor, buffer_tensors); + assert(is_invocation_valid(get_update_signature(attrs), invocation)); + + // execute update + TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + TaskImplFunction update_impl_fn = get_update_task_impl(attrs); + update_impl_fn.get().function_ptr(accessor); } } + + this->training_instance = next(this->training_instance.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc new file mode 100644 index 0000000000..646e3ac588 --- /dev/null +++ b/lib/local-execution/src/model_training_instance.cc @@ -0,0 +1,26 @@ +#include "local-execution/model_training_instance.h" + +namespace FlexFlow { + +ModelTrainingInstance next(ModelTrainingInstance const & old_training_instance) { + if (old_training_instance.optimizer_attrs.has()) { + AdamOptimizerAttrs old = old_training_instance.optimizer_attrs.get(); + double new_beta1_t = old.beta_t * old.beta1; + double new_beta2_t = old.beta2_t * old.beta2; + double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); + OptimizerAttrs new_attrs = OptimizerAttrs{AdamOptimizerAttrs{ + old.alpha, + old.beta1, + old.beta2, + old.weight_decay, + new_alpha_t, + new_beta1_t, + new_beta2_t, + old.epsilon + }}; + return ModelTrainingInstance{old_training_instance.loss_attrs, old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs}; + } + return old_training_instance; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 2f45802978..1b1173c70e 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -1,18 +1,11 @@ -#include "kernels/optimizer_kernels.h" #include "local-execution/optimizer.h" +#include "kernels/optimizer_kernels.h" #include "local-execution/profiling.h" +#include "utils/overload.h" namespace FlexFlow { -enum Slots { - ATTRS, - WEIGHT, - SGD_V, - PROFILING, - ADAM_M, - ADAM_V, - HANDLE -}; +enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE }; TaskSignature get_sgd_update_signature() { TaskSignature sig = make_empty_task_signature(); @@ -27,9 +20,9 @@ TaskSignature get_sgd_update_signature() { return sig; } -TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs, - tensor_guid_t const & weight, - tensor_guid_t const & sgd_v) { +TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, + tensor_guid_t const &weight, + tensor_guid_t const &sgd_v) { TaskBinding b; b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); @@ -46,53 +39,54 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs, return {task_id_t::SGD_UPD_PS_TASK_ID, b}; } -static void sgd_update_task_impl(TaskArgumentAccessor const & acc) { +static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto weight_grad = acc.get_tensor_grad(WEIGHT); auto weight = acc.get_tensor(WEIGHT); auto profiling = acc.get_argument(PROFILING); - assert (weight.shape == weight_grad.shape); + assert(weight.shape == weight_grad.shape); size_t size = 
weight_grad.shape.get_volume();
 
-  assert (weight_grad.shape.get_volume() & weight.shape.get_volume() == 0);
-  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+  assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas =
+      weight_grad.shape.get_volume() / weight.shape.get_volume();
 
   float *sgd_v_ptr;
   if (attrs.momentum > 0.0f) {
     auto sgd_v = acc.get_tensor(SGD_V);
-    assert (sgd_v.shape == weight.shape);
+    assert(sgd_v.shape == weight.shape);
     sgd_v_ptr = sgd_v.get_float_ptr();
   }
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     auto handle = acc.get_argument(HANDLE);
     profile(sgd_nccl_update_task_gpu,
-        profiling,
-        "[SGD NCCL] update_time = %.2lfms\n",
-        attrs.lr,
-        attrs.momentum,
-        attrs.nesterov,
-        attrs.weight_decay,
-        handle,
-        weight_grad.get_float_ptr(),
-        size,
-        weight.get_float_ptr(),
-        sgd_v_ptr);
+            profiling,
+            "[SGD NCCL] update_time = %.2lfms\n",
+            attrs.lr,
+            attrs.momentum,
+            attrs.nesterov,
+            attrs.weight_decay,
+            handle,
+            weight_grad.get_float_ptr(),
+            size,
+            weight.get_float_ptr(),
+            sgd_v_ptr);
 
   } else {
     profile(sgd_ps_update_task_gpu,
-        profiling,
-        "[SGD PS] update_time = %.2lfms\n",
-        attrs.lr,
-        attrs.momentum,
-        attrs.nesterov,
-        attrs.weight_decay,
-        weight_grad.get_float_ptr(),
-        size,
-        num_replicas,
-        weight.get_float_ptr(),
-        sgd_v_ptr);
+            profiling,
+            "[SGD PS] update_time = %.2lfms\n",
+            attrs.lr,
+            attrs.momentum,
+            attrs.nesterov,
+            attrs.weight_decay,
+            weight_grad.get_float_ptr(),
+            size,
+            num_replicas,
+            weight.get_float_ptr(),
+            sgd_v_ptr);
   }
 }
 
@@ -114,10 +108,10 @@ TaskSignature get_adam_update_signature() {
   return sig;
 }
 
-TaskInvocation adam_update(AdamOptimizerAttrs const & attrs,
-                          tensor_guid_t const & weight,
-                          tensor_guid_t const & adam_v,
-                          tensor_guid_t const & adam_m) {
+TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
+                           tensor_guid_t const &weight,
+                           tensor_guid_t const &adam_v,
+                           tensor_guid_t const &adam_m) {
   TaskBinding b;
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES});
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
@@ -133,7 +127,7 @@ TaskInvocation adam_update(AdamOptimizerAttrs const & attrs,
   return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
 }
 
-static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
+static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument(ATTRS);
   auto weight_grad = acc.get_tensor_grad(WEIGHT);
   auto weight = acc.get_tensor(WEIGHT);
@@ -142,11 +136,12 @@ static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
 
   auto profiling = acc.get_argument(PROFILING);
 
-  assert (weight.shape == weight_grad.shape);
+  assert(weight.shape == weight_grad.shape);
   size_t size = weight_grad.shape.get_volume();
 
-  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
-  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+  assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas =
+      weight_grad.shape.get_volume() / weight.shape.get_volume();
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     auto handle = acc.get_argument(HANDLE);
@@ -182,24 +177,38 @@ static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
   }
 }
 
-AdamOptimizerAttrs next(AdamOptimizerAttrs const & old) {
-  double new_beta1_t = old.beta_t * old.beta1;
-  double new_beta2_t = old.beta2_t * old.beta2;
-  double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
-  return AdamOptimizerAttrs{
-      old.alpha,
-      old.beta1,
-      old.beta2,
-      
old.weight_decay, - new_alpha_t, - new_beta1_t, - new_beta2_t, - old.epsilon - }; -} - TaskImplFunction get_adam_update_task_impl() { return TaskImplFunction{GenericTaskImplFunction{adam_update_task_impl}}; } +TaskSignature get_update_signature(OptimizerAttrs const &attrs) { + return attrs.visit(overload{ + [&](SGDOptimizerAttrs const &s) { return get_sgd_update_signature(); }, + [&](AdamOptimizerAttrs const &s) { + return get_adam_update_signature(); + }}); } + +TaskInvocation + get_update_invocation(OptimizerAttrs const &attrs, + tensor_guid_t const &weight, + std::vector const &buffer_tensors) { + return attrs.visit( + overload{[&](SGDOptimizerAttrs const &s) { + return sgd_update(s, weight, buffer_tensors.at(0)); + }, + [&](AdamOptimizerAttrs const &s) { + return adam_update( + s, weight, buffer_tensors.at(0), buffer_tensors.at(1)); + }}); +} + +TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) { + return attrs.visit(overload{ + [&](SGDOptimizerAttrs const &s) { return get_sgd_update_task_impl(); }, + [&](AdamOptimizerAttrs const &s) { + return get_adam_update_task_impl(); + }}); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc index 9e60c1b979..3d9946c89c 100644 --- a/lib/local-execution/test/src/test_loss_function.cc +++ b/lib/local-execution/test/src/test_loss_function.cc @@ -4,6 +4,7 @@ #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" #include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.h" #include "test_utils.h" namespace FlexFlow { @@ -18,6 +19,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { EnableProfiling::NO, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; + OptimizerAttrs optimizer_attrs = make_empty_sgd_attrs(); + // construct graph ComputationGraphBuilder cg_builder; @@ -47,11 +50,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); tensor_backing_map.insert({label_tensor, label_backing}); - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{ - SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{SparseCategoricalCrossEntropyLossAttrs{ + /*replace_labels=*/false}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, @@ -70,10 +75,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { tensor_backing_map.insert({label_tensor, label_backing}); SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + ModelTrainingInstance{LossAttrs{OtherLossAttrs{ + LossFunction::CATEGORICAL_CROSSENTROPY}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, @@ -85,11 +92,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{ - OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + 
ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, @@ -101,10 +110,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("LossFunction::IDENTITY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 46d5b22afb..6fbac987ec 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -32,6 +32,10 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n); layer_guid_t get_layer_by_name(ComputationGraph const &cg, std::string const &name); +std::vector + get_new_tensor_guids_for_layer_without_graph_insertion( + ComputationGraph const &, layer_guid_t const &n, int num_tensors); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h new file mode 100644 index 0000000000..b154116a4d --- /dev/null +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H +#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H + +#include "pcg/optimizer_attrs.dtg.h" + +namespace FlexFlow { + +OptimizerAttrs make_empty_sgd_attrs(); +OptimizerAttrs make_empty_adam_attrs(); + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index afa1774858..23ddd98f3c 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -64,4 +64,17 @@ layer_guid_t get_layer_by_name(ComputationGraph const &cg, return get_only(found); } +std::vector + get_new_tensor_guids_for_layer_without_graph_insertion( + ComputationGraph const &cg, layer_guid_t const &n, int num_tensors) { + std::vector new_tensor_guids; + int num_outgoing_tensors = get_outgoing_tensors(cg, n).size(); + + for (int i = 0; i < num_tensors; ++i) { + new_tensor_guids.push_back( + tensor_guid_t{DataflowOutput{n.raw_node, num_outgoing_tensors + i}}); + } + return new_tensor_guids; +} + } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc new file mode 100644 index 0000000000..a1c2a2e6d4 --- /dev/null +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -0,0 +1,14 @@ +#include "pcg/optimizer_attrs.h" + +namespace FlexFlow { + +OptimizerAttrs make_empty_sgd_attrs() { + return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}}; +} + +OptimizerAttrs make_empty_adam_attrs() { + return OptimizerAttrs{AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0}}; +} + +} // namespace FlexFlow From ad9b9eac557d1d84f3226019a62fddbe3b163cef Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 14:04:56 -0700 Subject: [PATCH 07/91] Format --- .../local-execution/model_training_instance.h | 4 +-- .../src/model_training_instance.cc | 31 ++++++++++--------- lib/pcg/include/pcg/optimizer_attrs.h | 2 +- lib/pcg/src/pcg/optimizer_attrs.cc | 6 ++-- 4 files changed, 23 
insertions(+), 20 deletions(-) diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 7ea027a636..afc8fa7472 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -5,8 +5,8 @@ #include "local-execution/model_training_instance.dtg.h" namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const & old); + +ModelTrainingInstance next(ModelTrainingInstance const &old); } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 646e3ac588..d34cc5d49a 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,24 +1,27 @@ #include "local-execution/model_training_instance.h" namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const & old_training_instance) { + +ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) { if (old_training_instance.optimizer_attrs.has<AdamOptimizerAttrs>()) { - AdamOptimizerAttrs old = old_training_instance.optimizer_attrs.get<AdamOptimizerAttrs>(); + AdamOptimizerAttrs old = + old_training_instance.optimizer_attrs.get<AdamOptimizerAttrs>(); double new_beta1_t = old.beta_t * old.beta1; double new_beta2_t = old.beta2_t * old.beta2; double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); - OptimizerAttrs new_attrs = OptimizerAttrs{AdamOptimizerAttrs{ - old.alpha, - old.beta1, - old.beta2, - old.weight_decay, - new_alpha_t, - new_beta1_t, - new_beta2_t, - old.epsilon - }}; + OptimizerAttrs new_attrs = + OptimizerAttrs{AdamOptimizerAttrs{old.alpha, + old.beta1, + old.beta2, + old.weight_decay, + new_alpha_t, + new_beta1_t, + new_beta2_t, + old.epsilon}}; - return ModelTrainingInstance{old_training_instance.loss_attrs, old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs}; + return ModelTrainingInstance{old_training_instance.loss_attrs, + old_training_instance.label_tensor, + old_training_instance.logit_tensor, + new_attrs}; } return old_training_instance; } diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index b154116a4d..550bf12cc8 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -4,7 +4,7 @@ #include "pcg/optimizer_attrs.dtg.h" namespace FlexFlow { - + OptimizerAttrs make_empty_sgd_attrs(); OptimizerAttrs make_empty_adam_attrs(); diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index a1c2a2e6d4..d51070b10d 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -1,14 +1,14 @@ #include "pcg/optimizer_attrs.h" namespace FlexFlow { - + OptimizerAttrs make_empty_sgd_attrs() { return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}}; } OptimizerAttrs make_empty_adam_attrs() { - return OptimizerAttrs{AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0}}; + return OptimizerAttrs{ + AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; } } // namespace FlexFlow From 1ddfadeebdcdcdabe8a84a03ec51fb5bcb02bfd4 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 14:17:21 -0700 Subject: [PATCH 08/91] E2E update test --- ...test_loss_function.cc => test_loss_e2e.cc} | 2 +- .../test/src/test_update_e2e.cc | 128 ++++++++++++++++++ 2 files changed, 129 insertions(+), 1 
deletion(-) rename lib/local-execution/test/src/{test_loss_function.cc => test_loss_e2e.cc} (99%) create mode 100644 lib/local-execution/test/src/test_update_e2e.cc diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_e2e.cc similarity index 99% rename from lib/local-execution/test/src/test_loss_function.cc rename to lib/local-execution/test/src/test_loss_e2e.cc index 3d9946c89c..15bf089b6b 100644 --- a/lib/local-execution/test/src/test_loss_function.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -10,7 +10,7 @@ namespace FlexFlow { TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Loss Function Local Execution") { + TEST_CASE("Local Execution E2E") { // initialize runtime configs ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc new file mode 100644 index 0000000000..7f7a90d9a3 --- /dev/null +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -0,0 +1,128 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/local_training_backing.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.h" +#include "test_utils.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Local Execution Update E2E") { + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::NO, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; + + // construct graph + ComputationGraphBuilder cg_builder; + + size_t batch_size = 10; + size_t data_dim = 100; + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + tensor_guid_t input_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::YES); + + float scalar = 4.0; + tensor_guid_t logit_tensor = + cg_builder.scalar_multiply(input_tensor, scalar); + + // allocate memory + Allocator allocator = create_local_cuda_memory_allocator(); + TensorBackingMap tensor_backing_map; + GenericTensorAccessorW input_backing = + allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({input_tensor, input_backing}); + + tensor_guid_t label_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = + allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({label_tensor, label_backing}); + + SUBCASE("SGDOptimizerAttrs") { + SUBCASE("momentum=0") { + OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ + /*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001 + }}; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, optimizer_attrs}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + local_backing.execute_update(); + } + SUBCASE("momentum=0.9") { + OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ + /*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001 + }}; + 
std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, optimizer_attrs}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + local_backing.execute_update(); + } + } + SUBCASE("AdamOptimizerAttrs") { + OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{ + /*alpha=*/ 0.001, + /*beta1=*/ 0.9, + /*beta2=*/ 0.999, + /*weight_decay=*/ 0.001, + /*alpha_t=*/ 0.001, + /*beta_t=*/ 0.9, + /*beta2_t=*/ 0.999, + /*epsilon=*/ 1e-8 + } + }; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, optimizer_attrs}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + local_backing.execute_update(); + } + } +} + +} // namespace FlexFlow From dde9496ada1c18ece558d9ac1b9bb38fbc147417 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 15:23:00 -0700 Subject: [PATCH 09/91] Format --- .../test/src/test_update_e2e.cc | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index 7f7a90d9a3..3899f60b83 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -48,46 +48,46 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("SGDOptimizerAttrs") { SUBCASE("momentum=0") { - OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ - /*lr=*/0.001, - /*momentum=*/0.0f, - /*nesterov=*/false, - /*weight_decay=*/0.001 - }}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; std::optional model_training_instance = ModelTrainingInstance{ LossAttrs{OtherLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, optimizer_attrs}; + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - tensor_backing_map, - runtime_arg_config, - model_training_instance); + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); local_backing.execute_update(); } SUBCASE("momentum=0.9") { - OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ - /*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001 - }}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; std::optional model_training_instance = ModelTrainingInstance{ LossAttrs{OtherLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, optimizer_attrs}; + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - tensor_backing_map, - runtime_arg_config, - model_training_instance); + 
cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -95,28 +95,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } SUBCASE("AdamOptimizerAttrs") { - OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{ - /*alpha=*/ 0.001, - /*beta1=*/ 0.9, - /*beta2=*/ 0.999, - /*weight_decay=*/ 0.001, - /*alpha_t=*/ 0.001, - /*beta_t=*/ 0.9, - /*beta2_t=*/ 0.999, - /*epsilon=*/ 1e-8 - } - }; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{ - LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + LossAttrs{ + OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, optimizer_attrs}; + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - tensor_backing_map, - runtime_arg_config, - model_training_instance); + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); From 59635d827e02dfcc26274784c9d7315985bf86cb Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Sep 2024 12:59:05 -0700 Subject: [PATCH 10/91] Small fixes --- lib/kernels/src/cuda/cuda_helper.cu | 10 +-- lib/kernels/src/device.h | 1 - .../fwd_bwd_op_task_impl_function.h | 32 ++++++++++ .../fwd_bwd_task_impl_function.h | 32 ---------- .../init_op_task_impl_function.h | 33 ++++++++++ .../local-execution/init_task_impl_function.h | 33 ---------- .../model_training_instance.struct.toml | 1 - .../include/local-execution/optimizer.h | 8 +-- .../task_impl_function.variant.toml | 12 ++-- .../include/local-execution/task_signature.h | 23 ++++--- .../task_signature.struct.toml | 9 ++- .../src/fwd_bwd_op_task_impl_function.cc | 54 ++++++++++++++++ .../src/fwd_bwd_task_impl_function.cc | 54 ---------------- .../src/init_op_task_impl_function.cc | 47 ++++++++++++++ .../src/init_task_impl_function.cc | 47 -------------- .../src/local_slots_backing.cc | 64 ++++++++----------- .../src/local_training_backing.cc | 8 +-- .../src/model_training_instance.cc | 3 +- lib/local-execution/src/ops/attention.cc | 6 +- lib/local-execution/src/ops/batch_matmul.cc | 4 +- lib/local-execution/src/ops/batch_norm.cc | 6 +- lib/local-execution/src/ops/cast.cc | 4 +- lib/local-execution/src/ops/combine.cc | 4 +- lib/local-execution/src/ops/concat.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/dropout.cc | 6 +- lib/local-execution/src/ops/element_binary.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 6 +- lib/local-execution/src/ops/flat.cc | 4 +- lib/local-execution/src/ops/gather.cc | 6 +- lib/local-execution/src/ops/layer_norm.cc | 6 +- lib/local-execution/src/ops/linear.cc | 6 +- lib/local-execution/src/ops/pool_2d.cc | 6 +- lib/local-execution/src/ops/reduce.cc | 6 +- lib/local-execution/src/ops/reduction.cc | 4 +- lib/local-execution/src/ops/repartition.cc | 6 +- lib/local-execution/src/ops/replicate.cc | 4 +- lib/local-execution/src/ops/reshape.cc | 6 +- lib/local-execution/src/ops/reverse.cc | 4 +- lib/local-execution/src/ops/softmax.cc | 6 +- 
lib/local-execution/src/ops/split.cc | 4 +- lib/local-execution/src/ops/topk.cc | 6 +- lib/local-execution/src/ops/transpose.cc | 6 +- lib/local-execution/src/optimizer.cc | 17 ++--- lib/local-execution/src/task_invocation.cc | 3 +- lib/local-execution/test/src/test_loss_e2e.cc | 12 ++-- .../test/src/test_update_e2e.cc | 6 +- .../op-attrs/ops/loss_attrs.variant.toml | 6 +- .../include/op-attrs/ops/loss_functions.h | 10 +-- ...=> nonconfigurable_loss_attrs.struct.toml} | 2 +- lib/op-attrs/src/loss_functions.cc | 2 +- lib/pcg/include/pcg/optimizer_attrs.h | 13 ---- lib/pcg/src/pcg/optimizer_attrs.cc | 14 ---- 53 files changed, 327 insertions(+), 361 deletions(-) create mode 100644 lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h create mode 100644 lib/local-execution/include/local-execution/init_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/init_task_impl_function.h create mode 100644 lib/local-execution/src/fwd_bwd_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/fwd_bwd_task_impl_function.cc create mode 100644 lib/local-execution/src/init_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/init_task_impl_function.cc rename lib/op-attrs/include/op-attrs/ops/{other_loss_attrs.struct.toml => nonconfigurable_loss_attrs.struct.toml} (86%) delete mode 100644 lib/pcg/include/pcg/optimizer_attrs.h delete mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 5a303ca15e..4ad22b3a57 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,19 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) { - CUDA_KERNEL_LOOP(i, size) { - ptr[i] = (b - a) * ptr[i] + a; - } -} - -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h index e32805fde3..ceff2f92ff 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/device.h @@ -71,7 +71,6 @@ inline int GET_BLOCKS(int const N) { } __global__ void scale_kernel(float *ptr, size_t size, float a, float b); -__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b); __global__ void ones_kernel(float *ptr, size_t size); diff --git a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h new file mode 100644 index 0000000000..cc82291f6a --- /dev/null +++ b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H + +#include "local-execution/task_argument_accessor.h" + +namespace FlexFlow { + +struct FwdBwdOpTaskImplFunction { + + std::optional (*function_ptr)(TaskArgumentAccessor const &); + + bool operator==(FwdBwdOpTaskImplFunction const 
&) const; + bool operator!=(FwdBwdOpTaskImplFunction const &) const; + bool operator<(FwdBwdOpTaskImplFunction const &) const; + bool operator>(FwdBwdOpTaskImplFunction const &) const; + bool operator<=(FwdBwdOpTaskImplFunction const &) const; + bool operator>=(FwdBwdOpTaskImplFunction const &) const; +}; + +std::string format_as(FwdBwdOpTaskImplFunction const &x); +std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash<::FlexFlow::FwdBwdOpTaskImplFunction> { + size_t operator()(::FlexFlow::FwdBwdOpTaskImplFunction const &) const; +}; +} // namespace std + +#endif diff --git a/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h deleted file mode 100644 index 7f80af77f3..0000000000 --- a/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" - -namespace FlexFlow { - -struct FwdBwdTaskImplFunction { - - std::optional (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(FwdBwdTaskImplFunction const &) const; - bool operator!=(FwdBwdTaskImplFunction const &) const; - bool operator<(FwdBwdTaskImplFunction const &) const; - bool operator>(FwdBwdTaskImplFunction const &) const; - bool operator<=(FwdBwdTaskImplFunction const &) const; - bool operator>=(FwdBwdTaskImplFunction const &) const; -}; - -std::string format_as(FwdBwdTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, FwdBwdTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::FwdBwdTaskImplFunction> { - size_t operator()(::FlexFlow::FwdBwdTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/local-execution/include/local-execution/init_op_task_impl_function.h new file mode 100644 index 0000000000..7b23a2bc64 --- /dev/null +++ b/lib/local-execution/include/local-execution/init_op_task_impl_function.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H + +#include "local-execution/device_specific_device_states.dtg.h" +#include "local-execution/task_argument_accessor.h" + +namespace FlexFlow { + +struct InitOpTaskImplFunction { + + DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); + + bool operator==(InitOpTaskImplFunction const &) const; + bool operator!=(InitOpTaskImplFunction const &) const; + bool operator<(InitOpTaskImplFunction const &) const; + bool operator>(InitOpTaskImplFunction const &) const; + bool operator<=(InitOpTaskImplFunction const &) const; + bool operator>=(InitOpTaskImplFunction const &) const; +}; + +std::string format_as(InitOpTaskImplFunction const &x); +std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash<::FlexFlow::InitOpTaskImplFunction> { + size_t operator()(::FlexFlow::InitOpTaskImplFunction const &) const; +}; +} // namespace std + +#endif diff --git a/lib/local-execution/include/local-execution/init_task_impl_function.h b/lib/local-execution/include/local-execution/init_task_impl_function.h deleted 
file mode 100644 index b85944e13a..0000000000 --- a/lib/local-execution/include/local-execution/init_task_impl_function.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H - -#include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/task_argument_accessor.h" - -namespace FlexFlow { - -struct InitTaskImplFunction { - - DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(InitTaskImplFunction const &) const; - bool operator!=(InitTaskImplFunction const &) const; - bool operator<(InitTaskImplFunction const &) const; - bool operator>(InitTaskImplFunction const &) const; - bool operator<=(InitTaskImplFunction const &) const; - bool operator>=(InitTaskImplFunction const &) const; -}; - -std::string format_as(InitTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, InitTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::InitTaskImplFunction> { - size_t operator()(::FlexFlow::InitTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index e3ff397e39..b460d6bd3a 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -8,7 +8,6 @@ features = [ ] includes = [ - "utils/optional.h", "op-attrs/ops/loss_attrs.dtg.h", "pcg/tensor_guid_t.dtg.h", "pcg/optimizer_attrs.dtg.h", diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 53dcad63de..e1f11b8a68 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -13,20 +13,20 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation(OptimizerAttrs const &, tensor_guid_t const &weight, - std::vector const &); + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, tensor_guid_t const &weight, - tensor_guid_t const &); + tensor_guid_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, tensor_guid_t const &weight, - tensor_guid_t const &, - tensor_guid_t const &); + tensor_guid_t const &adam_v, + tensor_guid_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml index 1be18bebfa..48cab9eb01 100644 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ b/lib/local-execution/include/local-execution/task_impl_function.variant.toml @@ -8,18 +8,18 @@ features = [ ] includes = [ - "local-execution/init_task_impl_function.h", - "local-execution/fwd_bwd_task_impl_function.h", + "local-execution/init_op_task_impl_function.h", + "local-execution/fwd_bwd_op_task_impl_function.h", "local-execution/generic_task_impl_function.h", ] 
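# Editorial note, not part of the original patch: the renamed headers in the
# includes list above map one-to-one onto the [[values]] alternatives below;
# C++ call sites then select an alternative through the generated accessor,
# e.g. impl_function.get<InitOpTaskImplFunction>().function_ptr, as the
# local_training_backing.cc hunks later in this patch do.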
[[values]] -type = "::FlexFlow::InitTaskImplFunction" -key = "init_task_impl_function" +type = "::FlexFlow::InitOpTaskImplFunction" +key = "init_op_task_impl_function" [[values]] -type = "::FlexFlow::FwdBwdTaskImplFunction" -key = "fwd_bwd_task_impl_function" +type = "::FlexFlow::FwdBwdOpTaskImplFunction" +key = "fwd_bwd_op_task_impl_function" [[values]] type = "::FlexFlow::GenericTaskImplFunction" diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h index d31a67e027..ed28f8eaea 100644 --- a/lib/local-execution/include/local-execution/task_signature.h +++ b/lib/local-execution/include/local-execution/task_signature.h @@ -1,13 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H -// #include "local-execution/tensor_guid_slot_spec.dtg.h" -// #include "local-execution/serialization.h" -// #include "utils/hash/unordered_map.h" -// #include "utils/hash/unordered_set.h" -// #include "utils/type_index.h" - #include "local-execution/task_signature.dtg.h" +#include "utils/type_index.h" namespace FlexFlow { @@ -38,15 +33,23 @@ void add_return_value(TaskSignature &task_signature) { task_signature.return_value = get_type_index_for_type(); } -// adds arg_slot without checking is_serializable, used for arguments that are -// deviceSpecific +/** + * @brief Adds an argument slot without checking if it is serializable. + * + * This function is used for arguments that are device-specific. + */ + template void add_unchecked_arg_slot(TaskSignature &task_signature, int name) { add_unchecked_arg_slot(task_signature, slot_id_t{name}); } -// adds arg_slot without checking is_serializable, used for arguments that are -// deviceSpecific +/** + * @brief Adds an argument slot without checking if it is serializable. + * + * This function is used for arguments that are device-specific. 
+ */ + template <typename T> void add_unchecked_arg_slot(TaskSignature &task_signature, slot_id_t name) { task_signature.task_arg_types.insert({name, get_type_index_for_type<T>()}); diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml index f86f7b0c57..fd15df91d5 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature.struct.toml @@ -3,17 +3,22 @@ name = "TaskSignature" features = [ "eq", "fmt", + "hash" ] includes = [ "local-execution/tensor_guid_slot_spec.dtg.h", - "utils/type_index.h", - "utils/optional.h" + "<typeindex>", + "<optional>" ] src_includes = [ "utils/fmt/unordered_map.h", "utils/fmt/unordered_set.h", + "utils/hash/unordered_map.h", + "utils/hash/unordered_set.h", + "utils/fmt/optional.h", + "utils/type_index.h" ] [[fields]] diff --git a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc new file mode 100644 index 0000000000..308dbfd3ae --- /dev/null +++ b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc @@ -0,0 +1,54 @@ +#include "local-execution/fwd_bwd_op_task_impl_function.h" + +namespace FlexFlow { + +bool FwdBwdOpTaskImplFunction::operator==( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr == other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator!=( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr != other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator<( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr < other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator>( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr > other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator<=( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr <= other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator>=( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr >= other.function_ptr; +} + +std::string format_as(FwdBwdOpTaskImplFunction const &x) { + std::ostringstream oss; + oss << "<FwdBwdOpTaskImplFunction>"; + return oss.str(); +} + +std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x) { + return s << fmt::to_string(x); +} + +} // namespace FlexFlow + +namespace std { +size_t hash<::FlexFlow::FwdBwdOpTaskImplFunction>::operator()( + ::FlexFlow::FwdBwdOpTaskImplFunction const &x) const { + return std::hash<decltype(x.function_ptr)>{}(x.function_ptr); +} +} // namespace std diff --git a/lib/local-execution/src/fwd_bwd_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_task_impl_function.cc deleted file mode 100644 index f85d7cec61..0000000000 --- a/lib/local-execution/src/fwd_bwd_task_impl_function.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "local-execution/fwd_bwd_task_impl_function.h" - -namespace FlexFlow { - -bool FwdBwdTaskImplFunction::operator==( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator!=( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator<( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator>( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool 
FwdBwdTaskImplFunction::operator<=( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator>=( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(FwdBwdTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} - -std::ostream &operator<<(std::ostream &s, FwdBwdTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::FwdBwdTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc new file mode 100644 index 0000000000..1c946982f5 --- /dev/null +++ b/lib/local-execution/src/init_op_task_impl_function.cc @@ -0,0 +1,47 @@ +#include "local-execution/init_op_task_impl_function.h" + +namespace FlexFlow { + +bool InitOpTaskImplFunction::operator==(InitOpTaskImplFunction const &other) const { + return this->function_ptr == other.function_ptr; +} + +bool InitOpTaskImplFunction::operator!=(InitOpTaskImplFunction const &other) const { + return this->function_ptr != other.function_ptr; +} + +bool InitOpTaskImplFunction::operator<(InitOpTaskImplFunction const &other) const { + return this->function_ptr < other.function_ptr; +} + +bool InitOpTaskImplFunction::operator>(InitOpTaskImplFunction const &other) const { + return this->function_ptr > other.function_ptr; +} + +bool InitOpTaskImplFunction::operator<=(InitOpTaskImplFunction const &other) const { + return this->function_ptr <= other.function_ptr; +} + +bool InitOpTaskImplFunction::operator>=(InitOpTaskImplFunction const &other) const { + return this->function_ptr >= other.function_ptr; +} + +std::string format_as(InitOpTaskImplFunction const &x) { + std::ostringstream oss; + oss << ""; + return oss.str(); +} +std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x) { + return s << fmt::to_string(x); +} + +} // namespace FlexFlow + +namespace std { +size_t hash::operator()( + ::FlexFlow::InitOpTaskImplFunction const &x) const { + return std::hash{}(x.function_ptr); +} +} // namespace std diff --git a/lib/local-execution/src/init_task_impl_function.cc b/lib/local-execution/src/init_task_impl_function.cc deleted file mode 100644 index 9501f72dd6..0000000000 --- a/lib/local-execution/src/init_task_impl_function.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "local-execution/init_task_impl_function.h" - -namespace FlexFlow { - -bool InitTaskImplFunction::operator==(InitTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool InitTaskImplFunction::operator!=(InitTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool InitTaskImplFunction::operator<(InitTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool InitTaskImplFunction::operator>(InitTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool InitTaskImplFunction::operator<=(InitTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool InitTaskImplFunction::operator>=(InitTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(InitTaskImplFunction const &x) { - 
std::ostringstream oss; - oss << ""; - return oss.str(); -} -std::ostream &operator<<(std::ostream &s, InitTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::InitTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 5059f29abd..0a1497b6c8 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -1,5 +1,6 @@ #include "local-execution/local_slots_backing.h" #include "utils/containers/contains_key.h" +#include "utils/containers/map_values.h" #include "utils/overload.h" namespace FlexFlow { @@ -55,17 +56,17 @@ void LocalSlotsBacking::allocate_optimizer_tensors( TaskSignature const &sig) { GenericTensorAccessorW weight_backing = get_tensor_backing(weight, IsGrad::NO); - int num_buffer_tensors = + int num_grad_buffer_tensors = sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) - std::vector buffer_tensors = + std::vector grad_buffer_tensors = get_new_tensor_guids_for_layer_without_graph_insertion( - cg, weight_layer, num_buffer_tensors); - for (auto const &tensor_guid : buffer_tensors) { + cg, weight_layer, num_grad_buffer_tensors); + for (tensor_guid_t const &tensor_guid : grad_buffer_tensors) { GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( get_tensor_shape(weight_backing.shape, weight_backing.data_type)); this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing}); } - this->weight_optimizer_tensor_guids.insert({weight, buffer_tensors}); + this->weight_optimizer_tensor_guids.insert({weight, grad_buffer_tensors}); } bool LocalSlotsBacking::is_tensor_allocated( @@ -123,8 +124,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( break; default: throw mk_runtime_error( - fmt::format("Invalid TensorRole")); // inserting role yields - // "type_is_unformattable" error + fmt::format("Invalid TensorRole {}", tensor_spec.role)); } IsGrad is_grad = slot_grad_id.is_grad; @@ -154,41 +154,29 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - ArgSlotsBacking mapping; - for (auto const &arg_binding : binding.get_arg_bindings()) { - slot_id_t arg_slot = arg_binding.first; - OpArgSpec op_arg_spec = arg_binding.second; - - mapping.insert({arg_slot, - op_arg_spec.visit(overload{ - [&](OpArgRefSpec const &s) { - return this->resolve_op_arg_ref_spec(s, op_guid); - }, - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }, - })}); - } - return mapping; + return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){ + return arg_binding.template visit(overload{ + [&](OpArgRefSpec const &s) { + return this->resolve_op_arg_ref_spec(s, op_guid); + }, + [&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; } + }); + }); } ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( TaskBinding const &binding) const { - ArgSlotsBacking mapping; - for (auto const &arg_binding : binding.get_arg_bindings()) { - slot_id_t arg_slot = arg_binding.first; - TaskArgSpec task_arg_spec = arg_binding.second; - - 
mapping.insert({arg_slot, - task_arg_spec.visit(overload{ - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }, - })}); - } - return mapping; + return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){ + return arg_binding.template visit(overload{ + [&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; } + }); + });; } ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index eb49f16df1..dff33826b9 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -60,7 +60,7 @@ DeviceSpecificDeviceStates TaskSignatureAndImpl task_sig_impl = this->task_registry.task_mapping.at(task_id); auto fn = - task_sig_impl.impl_function.get().function_ptr; + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -70,7 +70,7 @@ std::optional TaskSignatureAndImpl task_sig_impl = this->task_registry.task_mapping.at(task_id); auto fn = - task_sig_impl.impl_function.get().function_ptr; + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -160,13 +160,13 @@ void LocalTrainingBacking::execute_update() { // get tensors tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector buffer_tensors = + std::vector grad_buffer_tensors = this->local_slots_backing.weight_optimizer_tensor_guids.at( weight_tensor); // get invocation TaskInvocation invocation = - get_update_invocation(attrs, weight_tensor, buffer_tensors); + get_update_invocation(attrs, weight_tensor, grad_buffer_tensors); assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d34cc5d49a..c626bfc0e0 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -22,8 +22,9 @@ ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) { old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs}; + } else { + return old_training_instance; } - return old_training_instance; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 8ede2cb38b..5e693d43db 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -202,13 +202,13 @@ static std::optional } TaskImplFunction get_attention_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_attention_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_attention_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_attention_init_signature() { diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index 1eae409ae2..d60a003061 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ 
b/lib/local-execution/src/ops/batch_matmul.cc @@ -153,10 +153,10 @@ static std::optional } TaskImplFunction get_batch_matmul_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_batch_matmul_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_batch_matmul_fwd_signature() { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..254d7ef39e 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -144,13 +144,13 @@ static std::optional } TaskImplFunction get_batch_norm_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_batch_norm_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_batch_norm_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_batch_norm_init_signature() { diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..d3e43a46a0 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -79,10 +79,10 @@ static std::optional } TaskImplFunction get_cast_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_cast_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_cast_fwd_signature() { diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index ccc82cce17..92f2931344 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -85,10 +85,10 @@ OpTaskSignature get_combine_bwd_signature() { } TaskImplFunction get_combine_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_combine_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } }; // namespace FlexFlow diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 35f663b1cd..94d8fc6827 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -79,10 +79,10 @@ static std::optional } TaskImplFunction get_concat_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_concat_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_concat_fwd_signature() { diff --git a/lib/local-execution/src/ops/conv_2d.cc 
b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..7694a03947 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -132,13 +132,13 @@ static std::optional } TaskImplFunction get_conv_2d_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_conv_2d_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_conv_2d_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_conv_2d_init_signature() { diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index cac08866cc..77a2963313 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -87,13 +87,13 @@ static std::optional } TaskImplFunction get_dropout_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_dropout_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_dropout_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_dropout_init_signature() { diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index 48c6c699a2..2152b1beea 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -126,15 +126,15 @@ static std::optional } TaskImplFunction get_element_binary_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_element_binary_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_element_binary_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_element_binary_init_signature() { diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 502afb5f9f..64a0c5e94e 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -115,13 +115,13 @@ static std::optional } TaskImplFunction get_element_unary_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_element_unary_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_element_unary_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature 
get_element_unary_init_signature() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8df5703f60 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -53,10 +53,10 @@ static std::optional } TaskImplFunction get_flat_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_flat_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_flat_fwd_signature() { diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index a015c64f4d..558988f9a4 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -122,13 +122,13 @@ static std::optional } TaskImplFunction get_gather_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_gather_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_gather_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_gather_init_signature() { diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index e99d27319c..b1f44d69ae 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -146,13 +146,13 @@ static DeviceSpecificDeviceStates } TaskImplFunction get_layer_norm_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_layer_norm_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_layer_norm_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_layer_norm_fwd_signature() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9934e2a45c..9e29a0cce0 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -161,13 +161,13 @@ static std::optional } TaskImplFunction get_linear_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_linear_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_linear_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_linear_init_signature() { diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 789ed2cd63..093a3c1374 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ 
b/lib/local-execution/src/ops/pool_2d.cc
@@ -142,13 +142,13 @@ static std::optional
 }
 
 TaskImplFunction get_pool_2d_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_pool_2d_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_pool_2d_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_pool_2d_init_signature() {
diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc
index a043d9f847..01d2f0e86f 100644
--- a/lib/local-execution/src/ops/reduce.cc
+++ b/lib/local-execution/src/ops/reduce.cc
@@ -102,13 +102,13 @@ static std::optional
 }
 
 TaskImplFunction get_reduce_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_reduce_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reduce_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reduce_init_signature() {
diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc
index a58d79a4f8..f946b7d146 100644
--- a/lib/local-execution/src/ops/reduction.cc
+++ b/lib/local-execution/src/ops/reduction.cc
@@ -74,10 +74,10 @@ static std::optional
 }
 
 TaskImplFunction get_reduction_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reduction_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reduction_fwd_signature() {
diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc
index 73692f4a13..e260fd77f5 100644
--- a/lib/local-execution/src/ops/repartition.cc
+++ b/lib/local-execution/src/ops/repartition.cc
@@ -98,13 +98,13 @@ static std::optional
 }
 
 TaskImplFunction get_repartition_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_repartition_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_repartition_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_repartition_init_signature() {
diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc
index 135475a711..10cd80a6d9 100644
--- a/lib/local-execution/src/ops/replicate.cc
+++ b/lib/local-execution/src/ops/replicate.cc
@@ -73,10 +73,10 @@ static std::optional
 }
 
 TaskImplFunction get_replicate_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_replicate_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_replicate_fwd_signature() {
diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc
index 7584d405eb..433e961a8a 100644
--- a/lib/local-execution/src/ops/reshape.cc
+++ b/lib/local-execution/src/ops/reshape.cc
@@ -92,13 +92,13 @@ static std::optional
 }
 
 TaskImplFunction get_reshape_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_reshape_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reshape_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reshape_init_signature() {
diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc
index 366a579bea..b767b61b20 100644
--- a/lib/local-execution/src/ops/reverse.cc
+++ b/lib/local-execution/src/ops/reverse.cc
@@ -103,10 +103,10 @@ static std::optional
 }
 
 TaskImplFunction get_reverse_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reverse_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reverse_fwd_signature() {
diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc
index 4c7979ae9b..36c4afcaf3 100644
--- a/lib/local-execution/src/ops/softmax.cc
+++ b/lib/local-execution/src/ops/softmax.cc
@@ -108,13 +108,13 @@ static std::optional
 }
 
 TaskImplFunction get_softmax_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_softmax_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_softmax_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_softmax_init_signature() {
diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc
index 9f039d84f8..dc627aae96 100644
--- a/lib/local-execution/src/ops/split.cc
+++ b/lib/local-execution/src/ops/split.cc
@@ -114,10 +114,10 @@ static std::optional
 }
 
 TaskImplFunction get_split_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_split_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_split_fwd_signature() {
diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc
index 7f3519529a..ea4fc09e19 100644
--- a/lib/local-execution/src/ops/topk.cc
+++ b/lib/local-execution/src/ops/topk.cc
@@ -120,13 +120,13 @@ static std::optional
 }
 
 TaskImplFunction get_topk_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_topk_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_topk_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_topk_init_signature() {
diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc
index 5c3c1dd1ca..099206e372 100644
--- a/lib/local-execution/src/ops/transpose.cc
+++ b/lib/local-execution/src/ops/transpose.cc
@@ -100,13 +100,13 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) {
 }
 
 TaskImplFunction get_transpose_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_transpose_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_transpose_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_transpose_init_signature() {
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 1b1173c70e..485955a5dc 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -35,8 +35,9 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
     return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+  } else {
+    return {task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
-  return {task_id_t::SGD_UPD_PS_TASK_ID, b};
 }
 
 static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
@@ -183,8 +184,8 @@ TaskImplFunction get_adam_update_task_impl() {
 
 TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
-      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_signature(); },
-      [&](AdamOptimizerAttrs const &s) {
+      [&](SGDOptimizerAttrs const &) { return get_sgd_update_signature(); },
+      [&](AdamOptimizerAttrs const &) {
         return get_adam_update_signature();
       }});
 }
@@ -192,21 +193,21 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
 
 TaskInvocation
     get_update_invocation(OptimizerAttrs const &attrs,
                           tensor_guid_t const &weight,
-                          std::vector const &buffer_tensors) {
+                          std::vector const &grad_buffer_tensors) {
   return attrs.visit(
       overload{[&](SGDOptimizerAttrs const &s) {
-                 return sgd_update(s, weight, buffer_tensors.at(0));
+                 return sgd_update(s, weight, grad_buffer_tensors.at(0));
               },
              [&](AdamOptimizerAttrs const &s) {
                return adam_update(
-                   s, weight, buffer_tensors.at(0), buffer_tensors.at(1));
+                   s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1));
              }});
 }
 
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
-      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_task_impl(); },
-      [&](AdamOptimizerAttrs const &s) {
+      [&](SGDOptimizerAttrs const &) { return get_sgd_update_task_impl(); },
+      [&](AdamOptimizerAttrs const &) {
         return get_adam_update_task_impl();
       }});
 }
diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc
index c64af5332e..e15b9ae4ef 100644
--- a/lib/local-execution/src/task_invocation.cc
+++ b/lib/local-execution/src/task_invocation.cc
@@ -42,8 +42,7 @@ std::unordered_map const &
 }
 
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) {
-  // TODO: implement signature checking
-  return true;
+  NOT_IMPLEMENTED();
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 15bf089b6b..740c2a7355 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -19,7 +19,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::NO,
         ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
 
-    OptimizerAttrs optimizer_attrs = make_empty_sgd_attrs();
+    OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{
+        /*lr=*/0.0,
+        /*momentum=*/0.0,
+        /*nesterov=*/false,
+        /*weight_decay=*/0.0}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;
@@ -76,7 +80,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
       std::optional<ModelTrainingInstance> model_training_instance =
-          ModelTrainingInstance{LossAttrs{OtherLossAttrs{
+          ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{
                                     LossFunction::CATEGORICAL_CROSSENTROPY}},
                                 label_tensor,
                                 logit_tensor,
@@ -94,7 +98,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{
+              LossAttrs{NonconfigurableLossAttrs{
                   LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
@@ -112,7 +116,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     SUBCASE("LossFunction::IDENTITY") {
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}},
+              LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}},
              label_tensor,
              logit_tensor,
              optimizer_attrs};
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index 3899f60b83..6ad59c8286 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -55,7 +55,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           /*weight_decay=*/0.001}};
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{
+              LossAttrs{NonconfigurableLossAttrs{
                  LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
@@ -78,7 +78,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           /*weight_decay=*/0.001}};
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{
+              LossAttrs{NonconfigurableLossAttrs{
                  LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
@@ -107,7 +107,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
               LossAttrs{
-                  OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+                  NonconfigurableLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
              optimizer_attrs};
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
index 8a4f38839c..d60c6507cf 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
+++ b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
@@ -10,7 +10,7 @@ features = [
 
 includes = [
   "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h",
-  "op-attrs/ops/other_loss_attrs.dtg.h"
+  "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
 ]
 
 [[values]]
@@ -18,5 +18,5 @@ type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs"
 key = "sparse_categorical_ce_loss_attrs"
 
 [[values]]
-type = "::FlexFlow::OtherLossAttrs"
-key = "other_loss_attrs"
+type = "::FlexFlow::NonconfigurableLossAttrs"
+key = "nonconfigurable_loss_attrs"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
index 9fb0597197..74d2d0a479 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
@@ -1,11 +1,11 @@
 #ifndef _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
 #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
 
-#include "core.h"
-#include "loss_attrs.dtg.h"
-#include "loss_function.dtg.h"
-#include "other_loss_attrs.dtg.h"
-#include "sparse_categorical_ce_loss_attrs.dtg.h"
+#include "op-attrs/ops/core.h"
+#include "op-attrs/ops/loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_function.dtg.h"
+#include "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
+#include "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
similarity index 86%
rename from lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
rename to lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
index 81055f5835..0420e7ef7b 100644
--- a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "OtherLossAttrs"
+name = "NonconfigurableLossAttrs"
 features = [
   "eq",
   "ord",
diff --git a/lib/op-attrs/src/loss_functions.cc b/lib/op-attrs/src/loss_functions.cc
index cae88be453..50a26ec792 100644
--- a/lib/op-attrs/src/loss_functions.cc
+++ b/lib/op-attrs/src/loss_functions.cc
@@ -12,7 +12,7 @@ LossFunction get_loss_function(LossAttrs const &attrs) {
       overload{[&](SparseCategoricalCrossEntropyLossAttrs const &s) {
                  return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY;
                },
-               [&](OtherLossAttrs const &s) { return s.loss_type; }});
+               [&](NonconfigurableLossAttrs const &s) { return s.loss_type; }});
 }
 
 LossFunction parse_loss_name(std::string const &raw_name) {
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
deleted file mode 100644
index 550bf12cc8..0000000000
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H
-#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H
-
-#include "pcg/optimizer_attrs.dtg.h"
-
-namespace FlexFlow {
-
-OptimizerAttrs make_empty_sgd_attrs();
-OptimizerAttrs make_empty_adam_attrs();
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc
deleted file mode 100644
index d51070b10d..0000000000
--- a/lib/pcg/src/pcg/optimizer_attrs.cc
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "pcg/optimizer_attrs.h"
-
-namespace FlexFlow {
-
-OptimizerAttrs make_empty_sgd_attrs() {
-  return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}};
-}
-
-OptimizerAttrs make_empty_adam_attrs() {
-  return OptimizerAttrs{
-      AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}};
-}
-
-} // namespace FlexFlow

From 103ef073a4eedd0108ac6537541d5e4d2f6a03d9 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Wed, 11 Sep 2024 12:59:33 -0700
Subject: [PATCH 11/91] Format

---
 .../include/local-execution/optimizer.h       |  7 ++--
 .../include/local-execution/task_signature.h  |  4 +--
 .../src/init_op_task_impl_function.cc         | 18 ++++++----
 .../src/local_slots_backing.cc                | 35 +++++++++----------
 lib/local-execution/src/optimizer.cc          | 34 ++++++++----------
 lib/local-execution/test/src/test_loss_e2e.cc | 10 +++---
 .../test/src/test_update_e2e.cc               |  4 +--
 7 files changed, 57 insertions(+), 55 deletions(-)

diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index e1f11b8a68..a6395a4daa 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -11,9 +11,10 @@ namespace FlexFlow {
 
 TaskSignature get_update_signature(OptimizerAttrs const &);
-TaskInvocation get_update_invocation(OptimizerAttrs const &,
-                                     tensor_guid_t const &weight,
-                                     std::vector const &grad_buffer_tensors);
+TaskInvocation get_update_invocation(
+    OptimizerAttrs const &,
+    tensor_guid_t const &weight,
+    std::vector const &grad_buffer_tensors);
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &);
 
 TaskSignature get_sgd_update_signature();
diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h
index ed28f8eaea..6da69f2441 100644
--- a/lib/local-execution/include/local-execution/task_signature.h
+++ b/lib/local-execution/include/local-execution/task_signature.h
@@ -35,7 +35,7 @@ void add_return_value(TaskSignature &task_signature) {
 
 /**
  * @brief Adds an argument slot without checking if it is serializable.
- * 
+ *
  * This function is used for arguments that are device-specific.
 */
 
@@ -46,7 +46,7 @@ void add_unchecked_arg_slot(TaskSignature &task_signature, int name) {
 
 /**
 * @brief Adds an argument slot without checking if it is serializable.
- * 
+ *
 * This function is used for arguments that are device-specific.
 */
diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc
index 1c946982f5..abe84b828e 100644
--- a/lib/local-execution/src/init_op_task_impl_function.cc
+++ b/lib/local-execution/src/init_op_task_impl_function.cc
@@ -2,27 +2,33 @@
 
 namespace FlexFlow {
 
-bool InitOpTaskImplFunction::operator==(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator==(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr == other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator!=(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator!=(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr != other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator<(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator<(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr < other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator>(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator>(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr > other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator<=(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator<=(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr <= other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator>=(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator>=(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr >= other.function_ptr;
 }
 
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 0a1497b6c8..7050063254 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -154,29 +154,28 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){
-    return arg_binding.template visit<ConcreteArgSpec>(overload{
-      [&](OpArgRefSpec const &s) {
-        return this->resolve_op_arg_ref_spec(s, op_guid);
-      },
-      [&](RuntimeArgRefSpec const &s) {
-        return this->resolve_runtime_arg_ref_spec(s);
-      },
-      [](ConcreteArgSpec const &s) { return s; }
-    });
+  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+    return arg_binding.template visit<ConcreteArgSpec>(
+        overload{[&](OpArgRefSpec const &s) {
+                   return this->resolve_op_arg_ref_spec(s, op_guid);
+                 },
+                 [&](RuntimeArgRefSpec const &s) {
+                   return this->resolve_runtime_arg_ref_spec(s);
+                 },
+                 [](ConcreteArgSpec const &s) { return s; }});
   });
 }
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     TaskBinding const &binding) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){
-    return arg_binding.template visit<ConcreteArgSpec>(overload{
-      [&](RuntimeArgRefSpec const &s) {
-        return this->resolve_runtime_arg_ref_spec(s);
-      },
-      [](ConcreteArgSpec const &s) { return s; }
-    });
-  });;
+  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+    return arg_binding.template visit<ConcreteArgSpec>(
+        overload{[&](RuntimeArgRefSpec const &s) {
+                   return this->resolve_runtime_arg_ref_spec(s);
+                 },
+                 [](ConcreteArgSpec const &s) { return s; }});
+  });
+  ;
 }
 
 ConcreteArgSpec
     LocalSlotsBacking::resolve_op_arg_ref_spec(
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 485955a5dc..29beb15edf 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -35,7 +35,7 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
     return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
-  } else { 
+  } else {
     return {task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
 }
@@ -185,31 +185,27 @@ TaskImplFunction get_adam_update_task_impl() {
 TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &) { return get_sgd_update_signature(); },
-      [&](AdamOptimizerAttrs const &) {
-        return get_adam_update_signature();
-      }});
+      [&](AdamOptimizerAttrs const &) { return get_adam_update_signature(); }});
 }
 
-TaskInvocation
-    get_update_invocation(OptimizerAttrs const &attrs,
-                          tensor_guid_t const &weight,
-                          std::vector const &grad_buffer_tensors) {
-  return attrs.visit(
-      overload{[&](SGDOptimizerAttrs const &s) {
-                 return sgd_update(s, weight, grad_buffer_tensors.at(0));
-               },
-               [&](AdamOptimizerAttrs const &s) {
-                 return adam_update(
-                     s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1));
-               }});
+TaskInvocation get_update_invocation(
+    OptimizerAttrs const &attrs,
+    tensor_guid_t const &weight,
+    std::vector const &grad_buffer_tensors) {
+  return attrs.visit(overload{
+      [&](SGDOptimizerAttrs const &s) {
+        return sgd_update(s, weight, grad_buffer_tensors.at(0));
+      },
+      [&](AdamOptimizerAttrs const &s) {
+        return adam_update(
+            s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1));
+      }});
 }
 
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &) { return get_sgd_update_task_impl(); },
-      [&](AdamOptimizerAttrs const &) {
-        return get_adam_update_task_impl();
-      }});
+      [&](AdamOptimizerAttrs const &) { return get_adam_update_task_impl(); }});
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 740c2a7355..6cc66032ff 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -19,11 +19,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::NO,
         ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
 
-    OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{
-        /*lr=*/0.0,
-        /*momentum=*/0.0,
-        /*nesterov=*/false,
-        /*weight_decay=*/0.0}};
+    OptimizerAttrs optimizer_attrs =
+        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.0,
+                                         /*momentum=*/0.0,
+                                         /*nesterov=*/false,
+                                         /*weight_decay=*/0.0}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index 6ad59c8286..f300fe0720 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -106,8 +106,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           /*epsilon=*/1e-8}};
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{
-                  NonconfigurableLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+              LossAttrs{NonconfigurableLossAttrs{
+                  LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
              optimizer_attrs};

From f48f9ff97022910e69e0711b3cc0155db23da5bb Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 17 Sep 2024 17:43:22 -0700
Subject: [PATCH 12/91] Fix test and small issues

---
 lib/kernels/include/kernels/array_shape.h     |  1 +
 lib/kernels/include/kernels/profiling.h       |  1 +
 lib/kernels/src/array_shape.cc                | 28 +++-----
 .../local-execution/local_slots_backing.h     |  2 +-
 .../include/local-execution/loss_functions.h  |  2 +-
 .../include/local-execution/optimizer.h       |  2 +-
 .../include/local-execution/task_binding.h    | 58 +++++++++++++++++
 .../include/local-execution/task_invocation.h | 65 +------------------
 .../task_invocation.struct.toml               | 19 ++++++
 .../src/local_slots_backing.cc                |  4 +-
 .../src/local_training_backing.cc             |  5 +-
 lib/local-execution/src/loss_functions.cc     |  6 +-
 lib/local-execution/src/optimizer.cc          |  9 +--
 lib/local-execution/src/task_binding.cc       | 44 +++++++++++++
 lib/local-execution/src/task_invocation.cc    | 39 -----------
 lib/local-execution/test/src/test_loss_e2e.cc |  6 +-
 .../test/src/test_update_e2e.cc               |  6 +-
 17 files changed, 157 insertions(+), 140 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/task_binding.h
 create mode 100644 lib/local-execution/include/local-execution/task_invocation.struct.toml
 create mode 100644 lib/local-execution/src/task_binding.cc

diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 6b0b57b57f..015cacc7cb 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -17,6 +17,7 @@ struct ArrayShape {
   ArrayShape(size_t *dims, size_t num_dims);
   ArrayShape(TensorShape const &shape);
   ArrayShape(std::vector<std::size_t> const &);
+  ArrayShape(LegionTensorDims const &);
 
   /**
    * @brief Alias of ArrayShape::num_elements for compatibility with
diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h
index 655d540685..31c70010a0 100644
--- a/lib/kernels/include/kernels/profiling.h
+++ b/lib/kernels/include/kernels/profiling.h
@@ -40,6 +40,7 @@ std::optional profiling_wrapper(F const &f,
   }
 
   float elapsed = 0;
+  std::cout << "hello";
   checkCUDA(ffEventRecord(t_end, stream));
   checkCUDA(ffEventSynchronize(t_end));
   checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end));
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index 054e16e90a..8464212290 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -1,4 +1,5 @@
 #include "kernels/array_shape.h"
+#include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
 
 namespace FlexFlow {
@@ -19,6 +20,9 @@ ArrayShape::ArrayShape(TensorShape const &shape)
 ArrayShape::ArrayShape(std::vector<std::size_t> const &input_dims)
     : dims(input_dims) {}
 
+ArrayShape::ArrayShape(LegionTensorDims const &legion_tensor_dims)
+    : dims(legion_tensor_dims) {}
+
 std::size_t ArrayShape::get_volume() const {
   return this->num_elements();
 }
@@ -51,33 +55,19 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
 }
 
 ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
-  NOT_IMPLEMENTED();
+  legion_dim_t legion_end = legion_dim_from_ff_dim(end, num_dims());
+  return this->sub_shape(start, legion_end);
 }
 
 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  std::vector<std::size_t> new_shape;
-  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
-  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});
-
-  while (start_idx < end_idx) {
-    new_shape.push_back(this->at(start_idx));
-    start_idx = ff_dim_t{start_idx.value + 1};
-  }
-  return ArrayShape{new_shape};
+  return ArrayShape{legion_dims_from_ff_dims(
+      slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
 }
 
 ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
                                  std::optional<legion_dim_t> end) const {
-  std::vector<std::size_t> new_shape;
-  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
-  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});
-
-  while (start_idx < end_idx) {
-    new_shape.push_back(this->at(start_idx));
-    start_idx = add_to_legion_dim(start_idx, 1);
-  }
-  return ArrayShape{new_shape};
+  return ArrayShape{slice(this->dims, start, end)};
 }
 
 std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 439113c873..678be4c96b 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -7,7 +7,7 @@
 #include "local-execution/op_task_invocation.h"
 #include "local-execution/per_device_op_state.h"
 #include "local-execution/runtime_arg_config.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
index 58405536d8..2298115d5d 100644
--- a/lib/local-execution/include/local-execution/loss_functions.h
+++ b/lib/local-execution/include/local-execution/loss_functions.h
@@ -17,7 +17,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_
 
 #include "local-execution/task_impl_function.dtg.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "op-attrs/ops/loss_functions.h"
 
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index a6395a4daa..1e2cd65362 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
 
 #include "local-execution/task_impl_function.dtg.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h
new file mode 100644
index 0000000000..cbe210f438
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_binding.h
@@ -0,0 +1,58 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
+#define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
+
+#include "local-execution/slot_grad_id.dtg.h"
+#include "local-execution/slot_id_t.dtg.h"
+#include "local-execution/task_arg_spec.dtg.h"
+#include "local-execution/task_id_t.dtg.h"
+#include "local-execution/task_signature.dtg.h"
+#include "local-execution/tensor_guid_spec.dtg.h"
+
+namespace FlexFlow {
+
+struct TaskBinding {
+  TaskBinding() = default;
+
+  void bind(int, TensorGuidSpec const &);
+  void bind(slot_id_t, TensorGuidSpec const &);
+
+  template <typename T>
+  void bind_arg(int name, T const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, T const &t) {
+    this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)});
+  }
+
+  template <typename T>
+  void bind_arg(int name, RuntimeArgRef<T> const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, RuntimeArgRef<T> const &ref) {
+    this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)});
+  }
+
+  bool operator==(TaskBinding const &other) const;
+  bool operator!=(TaskBinding const &other) const;
+
+  std::unordered_map<SlotGradId, TensorGuidSpec> const &
+      get_tensor_bindings() const;
+  std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
+
+private:
+  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
+  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
+
+private:
+  void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec);
+  std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+             std::unordered_map<slot_id_t, TaskArgSpec> const &>
+      tie() const;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h
index 2317c65c02..93b5743a80 100644
--- a/lib/local-execution/include/local-execution/task_invocation.h
+++ b/lib/local-execution/include/local-execution/task_invocation.h
@@ -1,71 +1,12 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H
 
-#include "local-execution/slot_grad_id.dtg.h"
-#include "local-execution/slot_id_t.dtg.h"
-#include "local-execution/task_arg_spec.dtg.h"
-#include "local-execution/task_id_t.dtg.h"
-#include "local-execution/task_signature.dtg.h"
-#include "local-execution/tensor_guid_spec.dtg.h"
+#include "local-execution/task_invocation.dtg.h"
 
 namespace FlexFlow {
 
-struct TaskBinding {
-  TaskBinding() = default;
-
-  void bind(int, TensorGuidSpec const &);
-  void bind(slot_id_t, TensorGuidSpec const &);
-
-  template <typename T>
-  void bind_arg(int name, T const &t) {
-    this->bind_arg(slot_id_t{name}, t);
-  }
-
-  template <typename T>
-  void bind_arg(slot_id_t name, T const &t) {
-    this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)});
-  }
-
-  template <typename T>
-  void bind_arg(int name, RuntimeArgRef<T> const &t) {
-    this->bind_arg(slot_id_t{name}, t);
-  }
-
-  template <typename T>
-  void bind_arg(slot_id_t name, RuntimeArgRef<T> const &ref) {
-    this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)});
-  }
-
-  bool operator==(TaskBinding const &other) const;
-  bool operator!=(TaskBinding const &other) const;
-
-  std::unordered_map<SlotGradId, TensorGuidSpec> const &
-      get_tensor_bindings() const;
-  std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
-
-private:
-  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
-  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
-
-private:
-  void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec);
-  std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
-             std::unordered_map<slot_id_t, TaskArgSpec> const &>
-      tie() const;
-};
-
-struct TaskInvocation {
-public:
-  TaskInvocation() = delete;
-  TaskInvocation(task_id_t task_id, TaskBinding const &binding)
-      : task_id(task_id), binding(binding) {}
-
-public:
-  task_id_t task_id;
-  TaskBinding binding;
-};
-
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv);
-
-} // namespace FlexFlow
+
+}
 
 #endif
diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/local-execution/include/local-execution/task_invocation.struct.toml
new file mode 100644
index 0000000000..abcaabda93
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_invocation.struct.toml
@@ -0,0 +1,19 @@
+namespace = "FlexFlow"
+name = "TaskInvocation"
+features = [
+  "eq"
+]
+
+includes = [
+  "local-execution/task_binding.h",
+  "local-execution/task_id_t.dtg.h"
+]
+
+
+[[fields]]
+name = "task_id"
+type = "::FlexFlow::task_id_t"
+
+[[fields]]
+name = "binding"
+type = "::FlexFlow::TaskBinding"
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 7050063254..194d64c34b 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -154,7 +154,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+  return map_values(binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) {
     return arg_binding.template visit<ConcreteArgSpec>(
         overload{[&](OpArgRefSpec const &s) {
                    return this->resolve_op_arg_ref_spec(s, op_guid);
@@ -168,7 +168,7 @@ ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     TaskBinding const &binding) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+  return map_values(binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
     return arg_binding.template visit<ConcreteArgSpec>(
         overload{[&](RuntimeArgRefSpec const &s) {
                    return this->resolve_runtime_arg_ref_spec(s);
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index dff33826b9..7f0b179390 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -3,6 +3,7 @@
 #include "local-execution/model_training_instance.h"
 #include "local-execution/optimizer.h"
 #include "local-execution/task_signature_impl.h"
+#include "local-execution/task_invocation.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
@@ -124,7 +125,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
         backward(unwrapped_training_instance.loss_attrs,
                  unwrapped_training_instance.logit_tensor,
                  unwrapped_training_instance.label_tensor);
-    assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+    // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
     TaskArgumentAccessor loss_accessor =
         this->get_task_arg_accessor(loss_invocation);
     TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
@@ -167,7 +168,7 @@ void LocalTrainingBacking::execute_update() {
       // get invocation
       TaskInvocation invocation =
           get_update_invocation(attrs, weight_tensor, grad_buffer_tensors);
-      assert(is_invocation_valid(get_update_signature(attrs), invocation));
+      // assert(is_invocation_valid(get_update_signature(attrs), invocation));
 
       // execute update
       TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc
index 771d175a7d..3a4c616377 100644
--- a/lib/local-execution/src/loss_functions.cc
+++ b/lib/local-execution/src/loss_functions.cc
@@ -41,7 +41,7 @@ TaskInvocation
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
 
-  return {task_id_t::LOSS_BWD_TASK_ID, b};
+  return TaskInvocation{task_id_t::LOSS_BWD_TASK_ID, b};
 }
 
 static void backward_task_impl(TaskArgumentAccessor const &acc) {
@@ -51,7 +51,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   auto logit = acc.get_tensor(LOGIT);
   auto label = acc.get_tensor(LABEL);
   int batch_size = logit.shape.at(legion_dim_t{1});
-  // assuming logit shape is [parallel dim(?), batch dim, num classes]
+  // assuming logit shape is [batch dim, num classes]
 
   LossFunction loss_type = get_loss_function(attrs);
   float scale_factor = 1.0f / batch_size;
@@ -61,7 +61,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   }
 
   if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) {
-    // label shape is [parallel dim(?), batch dim, 1]
+    // label shape is [batch dim, 1]
     auto scce_attrs = attrs.get<SparseCategoricalCrossEntropyLossAttrs>();
     size_t ndim = logit.shape.num_dims();
     int num_classes = logit.shape.at(legion_dim_t{0});
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 29beb15edf..30f20bf8ec 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -34,9 +34,9 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
-    return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+    return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b};
   } else {
-    return {task_id_t::SGD_UPD_PS_TASK_ID, b};
+    return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
 }
 
@@ -123,9 +123,10 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
-    return {task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+    return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+  } else {
+    return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b};
   }
-  return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
 }
 
 static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
new file mode 100644
index 0000000000..a5a3b2dc34
--- /dev/null
+++ b/lib/local-execution/src/task_binding.cc
@@ -0,0 +1,44 @@
+#include "local-execution/task_binding.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
+  this->bind(slot_id_t{name}, tensor_guid_spec);
+}
+
+void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
+  this->tensor_bindings.insert(
+      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
+}
+
+void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
+  assert(!contains_key(this->arg_bindings, name));
+  this->arg_bindings.insert({name, arg_spec});
+}
+
+bool TaskBinding::operator==(TaskBinding const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool TaskBinding::operator!=(TaskBinding const &other) const {
+  return this->tie() != other.tie();
+}
+
+std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+           std::unordered_map<slot_id_t, TaskArgSpec> const &>
+    TaskBinding::tie() const {
+  return std::tie(this->tensor_bindings, this->arg_bindings);
+}
+
+std::unordered_map<SlotGradId, TensorGuidSpec> const &
+    TaskBinding::get_tensor_bindings() const {
+  return this->tensor_bindings;
+}
+
+std::unordered_map<slot_id_t, TaskArgSpec> const &
+    TaskBinding::get_arg_bindings() const {
+  return this->arg_bindings;
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc
index e15b9ae4ef..e08c1036da 100644
--- a/lib/local-execution/src/task_invocation.cc
+++ b/lib/local-execution/src/task_invocation.cc
@@ -1,46 +1,7 @@
 #include "local-execution/task_invocation.h"
-#include "utils/containers/contains_key.h"
 
 namespace FlexFlow {
 
-void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
-  this->bind(slot_id_t{name}, tensor_guid_spec);
-}
-
-void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
-  this->tensor_bindings.insert(
-      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
-}
-
-void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
-  assert(!contains_key(this->arg_bindings, name));
-  this->arg_bindings.insert({name, arg_spec});
-}
-
-bool TaskBinding::operator==(TaskBinding const &other) const {
-  return this->tie() == other.tie();
-}
-
-bool TaskBinding::operator!=(TaskBinding const &other) const {
-  return this->tie() != other.tie();
-}
-
-std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
-           std::unordered_map<slot_id_t, TaskArgSpec> const &>
-    TaskBinding::tie() const {
-  return std::tie(this->tensor_bindings, this->arg_bindings);
-}
-
-std::unordered_map<SlotGradId, TensorGuidSpec> const &
-    TaskBinding::get_tensor_bindings() const {
-  return this->tensor_bindings;
-}
-
-std::unordered_map<slot_id_t, TaskArgSpec> const &
-    TaskBinding::get_arg_bindings() const {
-  return this->arg_bindings;
-}
-
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) {
   NOT_IMPLEMENTED();
 }
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 6cc66032ff..3bc85354a0 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -4,7 +4,7 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.h"
+#include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
 
 namespace FlexFlow {
@@ -16,8 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
         DeviceSpecific::create(managed_handle.raw_handle()),
-        EnableProfiling::NO,
-        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index f300fe0720..b5a503f430 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -4,7 +4,7 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.h"
+#include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
 
 namespace FlexFlow {
@@ -16,8 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
         DeviceSpecific::create(managed_handle.raw_handle()),
-        EnableProfiling::NO,
-        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;

From 189c9c8c034143cd4a5fc4bab0db652444601915 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 17 Sep 2024 17:43:37 -0700
Subject: [PATCH 13/91] Format

---
 .../include/local-execution/task_invocation.h |  2 +-
 .../src/local_slots_backing.cc                | 36 ++++++++++---------
 .../src/local_training_backing.cc             |  2 +-
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h
index 93b5743a80..d03d6ac8e1 100644
--- a/lib/local-execution/include/local-execution/task_invocation.h
+++ b/lib/local-execution/include/local-execution/task_invocation.h
@@ -6,7 +6,7 @@ namespace FlexFlow {
 
 bool is_invocation_valid(TaskSignature const &sig,
                          TaskInvocation const &inv);
- 
+
 }
 
 #endif
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 194d64c34b..ff23c269e7 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -154,27 +154,29 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
-  return map_values(binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) {
-    return arg_binding.template visit<ConcreteArgSpec>(
-        overload{[&](OpArgRefSpec const &s) {
-                   return this->resolve_op_arg_ref_spec(s, op_guid);
-                 },
-                 [&](RuntimeArgRefSpec const &s) {
-                   return this->resolve_runtime_arg_ref_spec(s);
-                 },
-                 [](ConcreteArgSpec const &s) { return s; }});
-  });
+  return map_values(
+      binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) {
+        return arg_binding.template visit<ConcreteArgSpec>(
+            overload{[&](OpArgRefSpec const &s) {
+                       return this->resolve_op_arg_ref_spec(s, op_guid);
+                     },
+                     [&](RuntimeArgRefSpec const &s) {
+                       return this->resolve_runtime_arg_ref_spec(s);
+                     },
+                     [](ConcreteArgSpec const &s) { return s; }});
+      });
 }
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     TaskBinding const &binding) const {
-  return map_values(binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
-    return arg_binding.template visit<ConcreteArgSpec>(
-        overload{[&](RuntimeArgRefSpec const &s) {
-                   return this->resolve_runtime_arg_ref_spec(s);
-                 },
-                 [](ConcreteArgSpec const &s) { return s; }});
-  });
+  return map_values(
+      binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
+        return arg_binding.template visit<ConcreteArgSpec>(
+            overload{[&](RuntimeArgRefSpec const &s) {
+                       return this->resolve_runtime_arg_ref_spec(s);
+                     },
+                     [](ConcreteArgSpec const &s) { return s; }});
+      });
   ;
 }
 
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index 7f0b179390..9c1136f198 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -2,8 +2,8 @@
 #include "local-execution/loss_functions.h"
 #include "local-execution/model_training_instance.h"
 #include "local-execution/optimizer.h"
-#include "local-execution/task_signature_impl.h"
 #include "local-execution/task_invocation.h"
+#include "local-execution/task_signature_impl.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"

From b5647c8336848f0030445c9254cfc0e07b88ef4f Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 09:17:46 -0700
Subject: [PATCH 14/91] Pass tests after merge

---
 lib/kernels/include/kernels/profiling.h       |  1 -
 .../model_training_instance.struct.toml       |  2 +-
 .../src/local_training_backing.cc             |  2 +-
 .../test/src/test_local_cost_estimator.cc     |  2 +-
 lib/local-execution/test/src/test_loss_e2e.cc |  6 ++---
 .../test/src/test_update_e2e.cc               |  4 ++--
 .../op-attrs/ops/loss_attrs.variant.toml      | 22 ------------------
 .../op-attrs/ops/loss_function.enum.toml      | 23 -------------------
 .../include/op-attrs/ops/loss_functions.h     |  8 +++----
 .../loss_functions/loss_attrs.variant.toml    |  6 ++---
 .../ops/loss_functions/loss_functions.h       | 20 ----------------
 .../nonconfigurable_loss_attrs.struct.toml    |  2 +-
 .../other_loss_attrs.struct.toml              | 18 ---------------
 ...arse_categorical_ce_loss_attrs.struct.toml | 14 -----------
 .../src/op-attrs/ops/loss_functions.cc        |  2 +-
 15 files changed, 17 insertions(+), 115 deletions(-)
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h
 rename lib/op-attrs/include/op-attrs/ops/{ => loss_functions}/nonconfigurable_loss_attrs.struct.toml (80%)
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml

diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h
index 31c70010a0..655d540685 100644
--- a/lib/kernels/include/kernels/profiling.h
+++ b/lib/kernels/include/kernels/profiling.h
@@ -40,7 +40,6 @@ std::optional profiling_wrapper(F const &f,
   }
 
   float elapsed = 0;
-  std::cout << "hello";
   checkCUDA(ffEventRecord(t_end, stream));
   checkCUDA(ffEventSynchronize(t_end));
   checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end));
diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
index b460d6bd3a..28282e21c0 100644
--- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml
+++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
@@ -8,7 +8,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/ops/loss_attrs.dtg.h",
+  "op-attrs/ops/loss_functions/loss_attrs.dtg.h",
   "pcg/tensor_guid_t.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
 ]
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index b794cc6da6..edbb377047 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -4,10 +4,10 @@
 #include "local-execution/optimizer.h"
 #include "local-execution/task_invocation.h"
 #include "local-execution/task_signature_impl.h"
+#include "pcg/computation_graph.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
-#include "pcg/computation_graph.h"
 #include "utils/containers/reversed.h"
 #include "utils/exception.h"
 
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc
index 4c01df53e9..2b22d64969 100644
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ b/lib/local-execution/test/src/test_local_cost_estimator.cc
@@ -31,7 +31,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         /*kdim=*/embed_dim,
         /*vdim=*/embed_dim,
         /*dropout=*/0.0,
-        /*bias=*/true,
+        /*bias=*/false,
         /*add_bias_kv=*/false,
         /*add_zero_attn=*/false,
     };
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 3bc85354a0..4801aff6a9 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -33,7 +33,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     TensorShape input_shape = TensorShape{
         TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
     tensor_guid_t input_tensor =
-        cg_builder.create_tensor(input_shape, CreateGrad::YES);
+        cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
     tensor_guid_t logit_tensor =
@@ -50,7 +50,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       TensorShape label_shape = TensorShape{
          TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT};
       tensor_guid_t label_tensor =
-          cg_builder.create_tensor(label_shape, CreateGrad::NO);
+          cg_builder.create_input(label_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
           allocator.allocate_tensor(label_shape);
       tensor_backing_map.insert({label_tensor, label_backing});
@@ -73,7 +73,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
    SUBCASE("OtherAttrs") {
       tensor_guid_t label_tensor =
-          cg_builder.create_tensor(input_shape, CreateGrad::NO);
+          cg_builder.create_input(input_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(input_shape);
       tensor_backing_map.insert({label_tensor, label_backing});
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index b5a503f430..af4303fab8 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -27,7 +27,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     TensorShape input_shape = TensorShape{
         TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
     tensor_guid_t input_tensor =
-        cg_builder.create_tensor(input_shape, CreateGrad::YES);
+        cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
     tensor_guid_t logit_tensor =
@@ -41,7 +41,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     tensor_backing_map.insert({input_tensor, input_backing});
 
     tensor_guid_t label_tensor =
-        cg_builder.create_tensor(input_shape, CreateGrad::NO);
+        cg_builder.create_input(input_shape, CreateGrad::NO);
     GenericTensorAccessorW label_backing =
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({label_tensor, label_backing});
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
deleted file mode 100644
index d60c6507cf..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-namespace = "FlexFlow"
-name = "LossAttrs"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "json",
-  "fmt",
-]
-
-includes = [
-  "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h",
-  "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
-]
-
-[[values]]
-type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs"
-key = "sparse_categorical_ce_loss_attrs"
-
-[[values]]
-type = "::FlexFlow::NonconfigurableLossAttrs"
-key = "nonconfigurable_loss_attrs"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml
deleted file mode 100644
index b9cd13eabf..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml
+++ /dev/null
@@ -1,23 +0,0 @@
-namespace = "FlexFlow"
-name = "LossFunction"
-features = [
-  "hash",
-  "json",
-  "rapidcheck",
-  "fmt",
-]
-
-[[values]]
-name = "CATEGORICAL_CROSSENTROPY"
-
-[[values]]
-name = "SPARSE_CATEGORICAL_CROSSENTROPY"
-
-[[values]]
-name = "MEAN_SQUARED_ERROR_AVG_REDUCE"
-
-[[values]]
-name = "MEAN_SQUARED_ERROR_SUM_REDUCE"
-
-[[values]]
-name = "IDENTITY"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
index 74d2d0a479..657f8d91dc 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
@@ -2,10 +2,10 @@
 #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
 
 #include "op-attrs/ops/core.h"
-#include "op-attrs/ops/loss_attrs.dtg.h"
-#include "op-attrs/ops/loss_function.dtg.h"
-#include "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
-#include "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_function.dtg.h"
+#include "op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_functions/sparse_categorical_cross_entropy_loss_attrs.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml
index 17293095e4..943760d949 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml
@@ -11,7 +11,7 @@ features = [
 
 includes = [
   "op-attrs/ops/loss_functions/sparse_categorical_cross_entropy_loss_attrs.dtg.h",
-  "op-attrs/ops/loss_functions/other_loss_attrs.dtg.h",
+  "op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.dtg.h",
 ]
 
 [[values]]
@@ -19,5 +19,5 @@ type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs"
 key = "sparse_categorical_cross_entropy_loss"
 
 [[values]]
-type = "::FlexFlow::OtherLossAttrs"
-key = "other_loss"
+type = "::FlexFlow::NonconfigurableLossAttrs"
+key = "nonconfigurable_loss_attrs"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h
deleted file mode 100644
index ca8f3e6602..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
-#define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
-
-#include "op-attrs/ops/core.h"
-#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
-#include "op-attrs/ops/loss_functions/loss_function.dtg.h"
-
-namespace FlexFlow {
-
-CHECK_VALID_OP_ATTR(LossAttrs);
-
-LossFunction parse_loss_function_name(std::string const &);
-
-LossFunction get_loss_function(OtherLossAttrs const &);
-LossFunction get_loss_function(SparseCategoricalCrossEntropyLossAttrs const &);
-LossFunction get_loss_function(LossAttrs const &);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml
similarity index 80%
rename from lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
rename to lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml
index 0420e7ef7b..3fe7ac86c5 100644
--- a/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml
@@ -10,7 +10,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/ops/loss_function.dtg.h"
+  "op-attrs/ops/loss_functions/loss_function.dtg.h"
 ]
 
 [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml
deleted file mode 100644
index 284a4b1d7d..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml
+++ /dev/null
@@ -1,18 +0,0 @@
-namespace = "FlexFlow"
-name = "OtherLossAttrs"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "fmt",
-  "rapidcheck",
-  "json",
-]
-
-includes = [
-  "op-attrs/ops/loss_functions/loss_function.dtg.h",
-]
-
-[[fields]]
-name = "loss_type"
-type = "::FlexFlow::LossFunction"
diff --git a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
deleted file mode 100644
index 21378a1154..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
+++ /dev/null
@@ -1,14 +0,0 @@
-namespace = "FlexFlow"
-name = "SparseCategoricalCrossEntropyLossAttrs"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "json",
-  "rapidcheck",
-  "fmt",
-]
-
-[[fields]]
-name = "replace_labels"
-type = "bool"
diff --git a/lib/op-attrs/src/op-attrs/ops/loss_functions.cc b/lib/op-attrs/src/op-attrs/ops/loss_functions.cc
index a5c6aeb2a5..2b9a7533f0 100644
--- a/lib/op-attrs/src/op-attrs/ops/loss_functions.cc
+++ b/lib/op-attrs/src/op-attrs/ops/loss_functions.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "op-attrs/ops/loss_functions.h"
 #include "utils/containers/transform.h"
 #include "utils/exception.h"
 #include "utils/overload.h"

From f5ff91e9757a73c94d73dddaec2243b0c46c87ec Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 10:49:02 -0700
Subject: [PATCH 15/91] Fix input/weight differentiation

---
 .../local-execution/local_slots_backing.h     |  2 ++
 .../src/local_slots_backing.cc                | 24 +++++++++----------
 .../test/src/test_local_slots_backing.cc      | 12 +++++++---
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 4c6dbacfe3..93c534e583 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -54,6 +54,8 @@ struct LocalSlotsBacking {
   TensorBackingMap gradient_tensor_mapping;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
       input_tensor_slots;
+  std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
+      weight_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
       output_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 93cfe4498c..bdbfa4f222 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -22,8 +22,10 @@ void LocalSlotsBacking::allocate_outgoing_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
     Allocator &allocator) {
-  std::vector<tensor_guid_t> incoming_tensors =
-      get_incoming_tensors(computation_graph, layer_guid);
+  std::vector<tensor_guid_t> incoming_input_tensors =
+      get_incoming_inputs(computation_graph, layer_guid);
+  std::vector<tensor_guid_t> incoming_weight_tensors =
+      get_incoming_weights(computation_graph, layer_guid);
   std::vector<tensor_guid_t> outgoing_tensors =
       get_outgoing_tensors(computation_graph, layer_guid);
   for (tensor_guid_t const &output_tensor : outgoing_tensors) {
@@ -46,7 +48,8 @@ void LocalSlotsBacking::allocate_outgoing_tensors(
     }
   }
 
-  this->input_tensor_slots.insert({layer_guid, incoming_tensors});
+  this->input_tensor_slots.insert({layer_guid, incoming_input_tensors});
+  this->weight_tensor_slots.insert({layer_guid, incoming_weight_tensors});
   this->output_tensor_slots.insert({layer_guid, outgoing_tensors});
 }
 
@@ -100,13 +103,6 @@ GenericTensorAccessorW const &
 TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
   TensorSlotsBacking mapping;
-  int num_inputs = 0;
-  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
-    if (tensor_binding.first.is_grad == IsGrad::NO &&
-        tensor_binding.second.role == TensorRole::INPUT) {
-      num_inputs += 1;
-    }
-  }
 
   for (auto const &tensor_binding : binding.get_tensor_bindings()) {
     SlotGradId slot_grad_id = tensor_binding.first;
@@ -115,7 +111,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     int weight_adjusted_idx = 0;
     switch (tensor_spec.role) {
       case TensorRole::WEIGHT:
-        weight_adjusted_idx = num_inputs;
+        assert(contains_key(this->weight_tensor_slots, op_guid));
+        tensor_guids = this->weight_tensor_slots.at(op_guid);
+        break;
       case TensorRole::INPUT:
         assert(contains_key(this->input_tensor_slots, op_guid));
        tensor_guids = this->input_tensor_slots.at(op_guid);
@@ -130,8 +128,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
-        tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing =
+        this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc
index c18108d6b4..779ba43f26 100644
--- a/lib/local-execution/test/src/test_local_slots_backing.cc
+++ b/lib/local-execution/test/src/test_local_slots_backing.cc
@@ -157,11 +157,17 @@ TEST_SUITE(FF_TEST_SUITE) {
       local_slots_backing.allocate_outgoing_tensors(
           layer_guid, cg_builder.computation_graph, allocator);
       SUBCASE("Input tensor slots") {
-        std::vector<tensor_guid_t> correct_incoming_tensors =
-            get_incoming_tensors(cg_builder.computation_graph, layer_guid);
-        CHECK(correct_incoming_tensors ==
+        std::vector<tensor_guid_t> correct_incoming_input_tensors =
+            get_incoming_inputs(cg_builder.computation_graph, layer_guid);
+        CHECK(correct_incoming_input_tensors ==
              local_slots_backing.input_tensor_slots.at(layer_guid));
       }
+      SUBCASE("Weight tensor slots") {
+        std::vector<tensor_guid_t> correct_incoming_weight_tensors =
+            get_incoming_weights(cg_builder.computation_graph, layer_guid);
+        CHECK(correct_incoming_weight_tensors ==
+              local_slots_backing.weight_tensor_slots.at(layer_guid));
+      }
       SUBCASE("Output tensor slots") {
         std::vector<tensor_guid_t> correct_outgoing_tensors =
             get_outgoing_tensors(cg_builder.computation_graph, layer_guid);

From 7470e71eaa959f2304fc5e111b18f045473c3364 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 11:53:07 -0700
Subject: [PATCH 16/91] Fix signature to use unified rep

---
 .../local-execution/local_slots_backing.h     |  9 ++-
 .../non_graph_tensor_guid_t.struct.toml       | 17 ++++++
 .../include/local-execution/optimizer.h       |  9 +--
 .../task_signature.struct.toml                |  5 +-
 .../tensor_guid_slot_spec.struct.toml         |  5 --
 .../tensor_guid_spec.struct.toml              |  3 +-
 .../unified_tensor_guid.variant.toml          | 21 +++++++
 .../src/local_slots_backing.cc                | 56 +++++++++++--------
 .../src/local_training_backing.cc             |  5 +-
 lib/local-execution/src/loss_functions.cc     |  6 +-
 lib/local-execution/src/optimizer.cc          | 22 ++++----
 lib/local-execution/src/task_signature.cc     |  4 +-
 lib/pcg/include/pcg/computation_graph.h       |  3 -
 lib/pcg/src/pcg/computation_graph.cc          | 13 -----
 14 files changed, 105 insertions(+), 73 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml
 create mode 100644 lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml

diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 93c534e583..d201d3c405 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -4,10 +4,12 @@
 
 #include "kernels/accessor.h"
 #include "local-execution/local_task_argument_accessor.h"
+#include "local-execution/non_graph_tensor_guid_t.dtg.h"
 #include "local-execution/op_task_invocation.h"
 #include "local-execution/per_device_op_state.h"
 #include "local-execution/runtime_arg_config.h"
 #include "local-execution/task_invocation.dtg.h"
+#include "local-execution/unified_tensor_guid.dtg.h"
 #include "pcg/computation_graph.dtg.h"
 #include "pcg/layer_guid_t.dtg.h"
 #include "pcg/tensor_guid_t.dtg.h"
@@ -16,6 +18,8 @@ namespace FlexFlow {
 
 using TensorBackingMap =
     std::unordered_map<tensor_guid_t, GenericTensorAccessorW>;
+using NonGraphTensorBackingMap =
+    std::unordered_map<non_graph_tensor_guid_t, GenericTensorAccessorW>;
 
 struct LocalSlotsBacking {
   LocalSlotsBacking(TensorBackingMap const &, RuntimeArgConfig const &);
@@ -42,7 +46,7 @@ struct LocalSlotsBacking {
   ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &,
                                           layer_guid_t const &) const;
 
-  GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &,
+  GenericTensorAccessorW const &get_tensor_backing(UnifiedTensorGuid const &,
                                                    IsGrad) const;
   bool is_tensor_allocated(tensor_guid_t const &) const;
 
@@ -52,13 +56,14 @@ struct LocalSlotsBacking {
   // tensors
   TensorBackingMap tensor_mapping;
   TensorBackingMap gradient_tensor_mapping;
+  NonGraphTensorBackingMap optimizer_tensor_mapping;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
      input_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
      weight_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
      output_tensor_slots;
-  std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
+  std::unordered_map<layer_guid_t, std::vector<non_graph_tensor_guid_t>>
      weight_optimizer_tensor_guids;
 
   // arguments
diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml
new file mode 100644
index 0000000000..8904c232c9
--- /dev/null
+++ b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "non_graph_tensor_guid_t"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "fmt",
+  "json",
+]
+
+includes = [
+  "",
+]
+
+[[fields]]
+name = "raw_uid"
+type = "size_t"
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index 1e2cd65362..acf9b8a550 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
 
+#include "local-execution/non_graph_tensor_guid_t.dtg.h"
 #include "local-execution/task_impl_function.dtg.h"
 #include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
@@ -14,20 +15,20 @@ TaskSignature get_update_signature(OptimizerAttrs const &);
 TaskInvocation get_update_invocation(
     OptimizerAttrs const &,
     tensor_guid_t const &weight,
-    std::vector<tensor_guid_t> const &grad_buffer_tensors);
+    std::vector<non_graph_tensor_guid_t> const &grad_buffer_tensors);
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &);
 
 TaskSignature get_sgd_update_signature();
 TaskInvocation sgd_update(SGDOptimizerAttrs const &,
                           tensor_guid_t const &weight,
-                          tensor_guid_t const &sgd_v);
+                          non_graph_tensor_guid_t const &sgd_v);
 TaskImplFunction get_sgd_update_task_impl();
 
 TaskSignature get_adam_update_signature();
 TaskInvocation adam_update(AdamOptimizerAttrs const &,
                           tensor_guid_t const
&weight, - tensor_guid_t const &adam_v, - tensor_guid_t const &adam_m); + non_graph_tensor_guid_t const &adam_v, + non_graph_tensor_guid_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml index fd15df91d5..ac408a7b68 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature.struct.toml @@ -8,15 +8,14 @@ features = [ includes = [ "local-execution/tensor_guid_slot_spec.dtg.h", + "local-execution/slot_id_t.dtg.h", "", "" ] src_includes = [ "utils/fmt/unordered_map.h", - "utils/fmt/unordered_set.h", "utils/hash/unordered_map.h", - "utils/hash/unordered_set.h", "utils/fmt/optional.h", "utils/type_index.h" ] @@ -31,4 +30,4 @@ type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>" [[fields]] name = "tensor_guid_slots" -type = "std::unordered_set<::FlexFlow::TensorGuidSlotSpec>" +type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorGuidSlotSpec>" diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml index 4b3e5b2674..9b7e9c14f9 100644 --- a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml @@ -8,15 +8,10 @@ features = [ ] includes = [ - "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/is_grad.dtg.h", ] -[[fields]] -name = "name" -type = "::FlexFlow::slot_id_t" - [[fields]] name = "slot_type" type = "::FlexFlow::SlotType" diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml index a51d6ccf1b..1d147f60e5 100644 --- a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml +++ b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml @@ -10,11 +10,12 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", "local-execution/is_grad.dtg.h", + "local-execution/unified_tensor_guid.dtg.h" ] [[fields]] name = "tensor_guid" -type = "::FlexFlow::tensor_guid_t" +type = "::FlexFlow::UnifiedTensorGuid" [[fields]] name = "is_grad" diff --git a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml new file mode 100644 index 0000000000..3d2cd8e45f --- /dev/null +++ b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "UnifiedTensorGuid" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "pcg/tensor_guid_t.dtg.h", + "local-execution/non_graph_tensor_guid_t.dtg.h", +] + +[[values]] +type = "::FlexFlow::tensor_guid_t" +key = "tensor_guid" + +[[values]] +type = "::FlexFlow::non_graph_tensor_guid_t" +key = "non_graph_tensor_guid" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index bdbfa4f222..f10b7c0126 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -60,18 +60,19 @@ void LocalSlotsBacking::allocate_optimizer_tensors( Allocator &allocator, TaskSignature const &sig) { GenericTensorAccessorW 
weight_backing =
-      get_tensor_backing(weight, IsGrad::NO);
+      get_tensor_backing(UnifiedTensorGuid{weight}, IsGrad::NO);
   int num_grad_buffer_tensors =
       sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad)
-  std::vector<tensor_guid_t> grad_buffer_tensors =
-      get_new_tensor_guids_for_layer_without_graph_insertion(
-          cg, weight_layer, num_grad_buffer_tensors);
-  for (tensor_guid_t const &tensor_guid : grad_buffer_tensors) {
+  std::vector<non_graph_tensor_guid_t> grad_buffer_tensors;
+  for (int i = 0; i < num_grad_buffer_tensors; ++i) {
+    non_graph_tensor_guid_t buffer_tensor_guid = non_graph_tensor_guid_t{i};
     GenericTensorAccessorW buffer_backing = allocator.allocate_tensor(
         get_tensor_shape(weight_backing.shape, weight_backing.data_type));
-    this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing});
+    this->optimizer_tensor_mapping.insert({buffer_tensor_guid, buffer_backing});
+    grad_buffer_tensors.push_back(buffer_tensor_guid);
   }
-  this->weight_optimizer_tensor_guids.insert({weight, grad_buffer_tensors});
+  this->weight_optimizer_tensor_guids.insert(
+      {weight_layer, grad_buffer_tensors});
 }
 
 bool LocalSlotsBacking::is_tensor_allocated(
@@ -85,18 +86,26 @@ bool LocalSlotsBacking::is_gradient_tensor_allocated(
 }
 
 GenericTensorAccessorW const &
-    LocalSlotsBacking::get_tensor_backing(tensor_guid_t const &tensor_id,
+    LocalSlotsBacking::get_tensor_backing(UnifiedTensorGuid const &tensor_id,
                                           IsGrad is_grad) const {
-  switch (is_grad) {
-    case IsGrad::NO:
-      assert(contains_key(this->tensor_mapping, tensor_id));
-      return this->tensor_mapping.at(tensor_id);
-    case IsGrad::YES:
-      assert(contains_key(this->gradient_tensor_mapping, tensor_id));
-      return this->gradient_tensor_mapping.at(tensor_id);
-    default:
-      throw mk_runtime_error(fmt::format(
-          "IsGrad should only have YES or NO, received {}", is_grad));
+  if (tensor_id.has<tensor_guid_t>()) {
+    tensor_guid_t graph_tensor_guid = tensor_id.get<tensor_guid_t>();
+    switch (is_grad) {
+      case IsGrad::NO:
+        assert(contains_key(this->tensor_mapping, graph_tensor_guid));
+        return this->tensor_mapping.at(graph_tensor_guid);
+      case IsGrad::YES:
+        assert(contains_key(this->gradient_tensor_mapping, graph_tensor_guid));
+        return this->gradient_tensor_mapping.at(graph_tensor_guid);
+      default:
+        throw mk_runtime_error(fmt::format(
+            "IsGrad should only have YES or NO, received {}", is_grad));
+    }
+  } else {
+    non_graph_tensor_guid_t non_graph_tensor_guid =
+        tensor_id.get<non_graph_tensor_guid_t>();
+    assert(contains_key(this->optimizer_tensor_mapping, non_graph_tensor_guid));
+    return this->optimizer_tensor_mapping.at(non_graph_tensor_guid);
   }
 }
 
@@ -128,8 +137,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing =
-        this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
+        UnifiedTensorGuid{tensor_guids.at(tensor_spec.idx)}, is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
@@ -144,8 +153,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     SlotGradId slot_grad_id = tensor_binding.first;
     TensorGuidSpec tensor_spec = tensor_binding.second;
 
-    GenericTensorAccessorW accessor =
-        this->get_tensor_backing(tensor_spec.tensor_guid, slot_grad_id.is_grad);
+    GenericTensorAccessorW accessor = this->get_tensor_backing(
+        UnifiedTensorGuid{tensor_spec.tensor_guid}, slot_grad_id.is_grad);
 
     mapping.insert({slot_grad_id, accessor});
   }
@@ -199,7 +208,8 @@ ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec(
assert(input_tensor_guids.size() > index_op_arg_ref.idx); GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - input_tensor_guids.at(index_op_arg_ref.idx), IsGrad::NO); + UnifiedTensorGuid{input_tensor_guids.at(index_op_arg_ref.idx)}, + IsGrad::NO); ParallelTensorShape shape = lift_to_parallel( get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); return ConcreteArgSpec::create(shape); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index edbb377047..dafa28a70f 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -162,9 +162,8 @@ void LocalTrainingBacking::execute_update() { // get tensors tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector grad_buffer_tensors = - this->local_slots_backing.weight_optimizer_tensor_guids.at( - weight_tensor); + std::vector grad_buffer_tensors = + this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation TaskInvocation invocation = diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 3a4c616377..a37c1d706b 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -35,9 +35,9 @@ TaskSignature get_loss_bwd_signature() { TaskInvocation backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) { TaskBinding b; - b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::NO}); - b.bind(LABEL, TensorGuidSpec{label, IsGrad::NO}); - b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::YES}); + b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::NO}); + b.bind(LABEL, TensorGuidSpec{UnifiedTensorGuid{label}, IsGrad::NO}); + b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::YES}); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 30f20bf8ec..1e06dee96a 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -22,12 +22,12 @@ TaskSignature get_sgd_update_signature() { TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, tensor_guid_t const &weight, - tensor_guid_t const &sgd_v) { + non_graph_tensor_guid_t const &sgd_v) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); if (attrs.momentum > 0.0f) { - b.bind(SGD_V, TensorGuidSpec{sgd_v, IsGrad::YES}); + b.bind(SGD_V, TensorGuidSpec{UnifiedTensorGuid{sgd_v}, IsGrad::YES}); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -111,13 +111,13 @@ TaskSignature get_adam_update_signature() { TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, tensor_guid_t const &weight, - tensor_guid_t const &adam_v, - tensor_guid_t const &adam_m) { + non_graph_tensor_guid_t const &adam_v, + non_graph_tensor_guid_t const &adam_m) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); - b.bind(ADAM_M, TensorGuidSpec{adam_m, IsGrad::YES}); - b.bind(ADAM_V, TensorGuidSpec{adam_v, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); + b.bind(WEIGHT, 
TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); + b.bind(ADAM_M, TensorGuidSpec{UnifiedTensorGuid{adam_m}, IsGrad::YES}); + b.bind(ADAM_V, TensorGuidSpec{UnifiedTensorGuid{adam_v}, IsGrad::YES}); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -192,7 +192,7 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors) { + std::vector const &grad_buffer_tensors) { return attrs.visit(overload{ [&](SGDOptimizerAttrs const &s) { return sgd_update(s, weight, grad_buffer_tensors.at(0)); diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc index 3bba9e2c8a..27bcbcd266 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/local-execution/src/task_signature.cc @@ -18,8 +18,8 @@ void add_slot(TaskSignature &task_signature, IsGrad is_grad, SlotType slot_type) { TensorGuidSlotSpec tensor_guid_slot_spec = - TensorGuidSlotSpec{name, slot_type, is_grad}; - task_signature.tensor_guid_slots.insert(tensor_guid_slot_spec); + TensorGuidSlotSpec{slot_type, is_grad}; + task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec}); } } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 32ed0e3025..f70d9f7404 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -52,9 +52,6 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n); layer_guid_t get_layer_by_name(ComputationGraph const &cg, std::string const &name); -std::vector - get_new_tensor_guids_for_layer_without_graph_insertion( - ComputationGraph const &, layer_guid_t const &n, int num_tensors); std::string as_dot(ComputationGraph const &); void debug_print_dot(ComputationGraph const &); diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 6f6c10d798..a69e54fd93 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -175,19 +175,6 @@ layer_guid_t get_layer_by_name(ComputationGraph const &cg, return get_only(found); } -std::vector - get_new_tensor_guids_for_layer_without_graph_insertion( - ComputationGraph const &cg, layer_guid_t const &n, int num_tensors) { - std::vector new_tensor_guids; - int num_outgoing_tensors = get_outgoing_tensors(cg, n).size(); - - for (int i = 0; i < num_tensors; ++i) { - new_tensor_guids.push_back( - tensor_guid_t{DataflowOutput{n.raw_node, num_outgoing_tensors + i}}); - } - return new_tensor_guids; -} - std::string as_dot(ComputationGraph const &cg) { std::function get_node_label = [](LayerAttrs const &a) -> std::string { From deece1be7eae96ef4604679a13c2ec58207632e3 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 1 Oct 2024 12:39:29 -0700 Subject: [PATCH 17/91] Fix model training instance abstraction --- .../local-execution/local_training_backing.h | 5 +++- .../local-execution/model_training_instance.h | 13 -------- .../model_training_instance.struct.toml | 7 +---- .../non_graph_tensor_guid_t.struct.toml | 6 +--- .../src/local_cost_estimator.cc | 4 ++- .../src/local_training_backing.cc | 22 +++++++------- .../src/model_training_instance.cc | 30 ------------------- lib/local-execution/test/src/test_loss_e2e.cc | 24 +++++++-------- .../test/src/test_update_e2e.cc | 18 +++++------ lib/pcg/include/pcg/optimizer_attrs.h | 13 ++++++++ 
lib/pcg/src/pcg/optimizer_attrs.cc | 24 +++++++++++++++ 11 files changed, 79 insertions(+), 87 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/model_training_instance.h delete mode 100644 lib/local-execution/src/model_training_instance.cc create mode 100644 lib/pcg/include/pcg/optimizer_attrs.h create mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 08a458cb7f..2313d55732 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -5,6 +5,7 @@ #include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" #include "pcg/computation_graph.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" namespace FlexFlow { @@ -16,7 +17,8 @@ struct LocalTrainingBacking { ComputationGraph const &, TensorBackingMap const &, RuntimeArgConfig const &, - std::optional &); + std::optional const &, + std::optional const &); void execute_init(); PerLayerElapsedTime execute_forward(); @@ -38,6 +40,7 @@ struct LocalTrainingBacking { TaskRegistry task_registry; LocalSlotsBacking local_slots_backing; std::optional training_instance; + std::optional optimizer_attrs; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h deleted file mode 100644 index afc8fa7472..0000000000 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ /dev/null @@ -1,13 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H - -#include "local-execution/model_training_instance.dtg.h" - -namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const &old); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index 28282e21c0..dcfaf2175d 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -9,8 +9,7 @@ features = [ includes = [ "op-attrs/ops/loss_functions/loss_attrs.dtg.h", - "pcg/tensor_guid_t.dtg.h", - "pcg/optimizer_attrs.dtg.h", + "pcg/tensor_guid_t.dtg.h" ] [[fields]] @@ -24,7 +23,3 @@ type = "::FlexFlow::tensor_guid_t" [[fields]] name = "logit_tensor" type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "optimizer_attrs" -type = "::FlexFlow::OptimizerAttrs" diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml index 8904c232c9..4832ecaafa 100644 --- a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml +++ b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml @@ -8,10 +8,6 @@ features = [ "json", ] -includes = [ - "", -] - [[fields]] name = "raw_uid" -type = "size_t" +type = "int" diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index f153db3240..186c2d516a 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ 
b/lib/local-execution/src/local_cost_estimator.cc @@ -76,11 +76,13 @@ CostDetails LocalCostEstimator::estimate_cost( get_vector_piece_attrs(outputs)); std::optional model_training_instance = std::nullopt; + std::optional optimizer_attrs = std::nullopt; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, this->runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); PerLayerElapsedTime fwd = local_backing.execute_forward(); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index dafa28a70f..46a8f83709 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,10 +1,10 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" -#include "local-execution/model_training_instance.h" #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" #include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -18,11 +18,12 @@ LocalTrainingBacking::LocalTrainingBacking( ComputationGraph const &computation_graph, TensorBackingMap const &tensor_backing_mapping, RuntimeArgConfig const &runtime_arg_config, - std::optional &training_instance) + std::optional const &training_instance, + std::optional const &optimizer_attrs) : allocator(allocator), computation_graph(computation_graph), local_slots_backing(tensor_backing_mapping, runtime_arg_config), task_registry(empty_task_registry()), - training_instance(training_instance) { + training_instance(training_instance), optimizer_attrs(optimizer_attrs) { for (layer_guid_t const &node : topological_ordering(this->computation_graph)) { @@ -38,8 +39,8 @@ LocalTrainingBacking::LocalTrainingBacking( // allocate optimizer buffers if (attrs.has() && this->training_instance.has_value()) { - OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; - TaskSignature sig = get_update_signature(attrs); + assert(this->optimizer_attrs.has_value()); + TaskSignature sig = get_update_signature(this->optimizer_attrs.value()); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); this->local_slots_backing.allocate_optimizer_tensors( @@ -153,7 +154,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { void LocalTrainingBacking::execute_update() { assert(this->training_instance.has_value()); - OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; + assert(this->optimizer_attrs.has_value()); for (layer_guid_t const &node : topological_ordering(this->computation_graph)) { @@ -166,18 +167,19 @@ void LocalTrainingBacking::execute_update() { this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation - TaskInvocation invocation = - get_update_invocation(attrs, weight_tensor, grad_buffer_tensors); + TaskInvocation invocation = get_update_invocation( + this->optimizer_attrs.value(), weight_tensor, grad_buffer_tensors); // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); - TaskImplFunction update_impl_fn = get_update_task_impl(attrs); + TaskImplFunction update_impl_fn = + 
get_update_task_impl(this->optimizer_attrs.value()); update_impl_fn.get().function_ptr(accessor); } } - this->training_instance = next(this->training_instance.value()); + this->optimizer_attrs = next(this->optimizer_attrs.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc deleted file mode 100644 index c626bfc0e0..0000000000 --- a/lib/local-execution/src/model_training_instance.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "local-execution/model_training_instance.h" - -namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) { - if (old_training_instance.optimizer_attrs.has()) { - AdamOptimizerAttrs old = - old_training_instance.optimizer_attrs.get(); - double new_beta1_t = old.beta_t * old.beta1; - double new_beta2_t = old.beta2_t * old.beta2; - double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); - OptimizerAttrs new_attrs = - OptimizerAttrs{AdamOptimizerAttrs{old.alpha, - old.beta1, - old.beta2, - old.weight_decay, - new_alpha_t, - new_beta1_t, - new_beta2_t, - old.epsilon}}; - return ModelTrainingInstance{old_training_instance.loss_attrs, - old_training_instance.label_tensor, - old_training_instance.logit_tensor, - new_attrs}; - } else { - return old_training_instance; - } -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 4801aff6a9..72df1a08f1 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -59,13 +59,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{SparseCategoricalCrossEntropyLossAttrs{ /*replace_labels=*/false}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -83,13 +83,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{ LossFunction::CATEGORICAL_CROSSENTROPY}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -101,13 +101,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -118,13 +118,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ModelTrainingInstance{ LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + 
optimizer_attrs);
       local_backing.execute_init();
       local_backing.execute_forward();
       local_backing.execute_backward();
     }
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index af4303fab8..96b748806f 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -58,13 +58,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             LossAttrs{NonconfigurableLossAttrs{
                 LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
             label_tensor,
-            logit_tensor,
-            optimizer_attrs};
+            logit_tensor};
     LocalTrainingBacking local_backing(allocator,
                                        cg_builder.computation_graph,
                                        tensor_backing_map,
                                        runtime_arg_config,
-                                       model_training_instance);
+                                       model_training_instance,
+                                       optimizer_attrs);
     local_backing.execute_init();
     local_backing.execute_forward();
     local_backing.execute_backward();
@@ -81,13 +81,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             LossAttrs{NonconfigurableLossAttrs{
                 LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
             label_tensor,
-            logit_tensor,
-            optimizer_attrs};
+            logit_tensor};
     LocalTrainingBacking local_backing(allocator,
                                        cg_builder.computation_graph,
                                        tensor_backing_map,
                                        runtime_arg_config,
-                                       model_training_instance);
+                                       model_training_instance,
+                                       optimizer_attrs);
     local_backing.execute_init();
     local_backing.execute_forward();
     local_backing.execute_backward();
@@ -109,13 +109,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             LossAttrs{NonconfigurableLossAttrs{
                 LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
             label_tensor,
-            logit_tensor,
-            optimizer_attrs};
+            logit_tensor};
     LocalTrainingBacking local_backing(allocator,
                                        cg_builder.computation_graph,
                                        tensor_backing_map,
                                        runtime_arg_config,
-                                       model_training_instance);
+                                       model_training_instance,
+                                       optimizer_attrs);
     local_backing.execute_init();
     local_backing.execute_forward();
     local_backing.execute_backward();
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
new file mode 100644
index 0000000000..4b78f66fe4
--- /dev/null
+++ b/lib/pcg/include/pcg/optimizer_attrs.h
@@ -0,0 +1,13 @@
+
+#ifndef _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H
+#define _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H
+
+#include "pcg/optimizer_attrs.dtg.h"
+
+namespace FlexFlow {
+
+OptimizerAttrs next(OptimizerAttrs const &old);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc
new file mode 100644
index 0000000000..5307450a68
--- /dev/null
+++ b/lib/pcg/src/pcg/optimizer_attrs.cc
@@ -0,0 +1,24 @@
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+OptimizerAttrs next(OptimizerAttrs const &old_attrs) {
+  if (old_attrs.has<AdamOptimizerAttrs>()) {
+    AdamOptimizerAttrs old = old_attrs.get<AdamOptimizerAttrs>();
+    double new_beta1_t = old.beta_t * old.beta1;
+    double new_beta2_t = old.beta2_t * old.beta2;
+    double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
+    return OptimizerAttrs{AdamOptimizerAttrs{old.alpha,
+                                             old.beta1,
+                                             old.beta2,
+                                             old.weight_decay,
+                                             new_alpha_t,
+                                             new_beta1_t,
+                                             new_beta2_t,
+                                             old.epsilon}};
+  } else {
+    return old_attrs;
+  }
+}
+
+} // namespace FlexFlow

From 1d3cc9498fb5afe5e9f1b0aa1e50260a58e1c424 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 13:10:28 -0700
Subject: [PATCH 18/91] Change subcase test name

---
 lib/local-execution/test/src/test_loss_e2e.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 72df1a08f1..37024adc26 100644
--- 
a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -71,7 +71,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { local_backing.execute_backward(); } - SUBCASE("OtherAttrs") { + SUBCASE("NonconfigurableLossAttrs") { tensor_guid_t label_tensor = cg_builder.create_input(input_shape, CreateGrad::NO); GenericTensorAccessorW label_backing = From 3cf5d08fb3b56f0e70145179c5dfd72eacd3cc2e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 16 Oct 2024 12:34:59 -0700 Subject: [PATCH 19/91] Quick fixes --- lib/kernels/include/kernels/array_shape.h | 8 ++++---- lib/kernels/include/kernels/legion_dim.h | 3 +++ lib/kernels/src/allocation.cc | 2 +- lib/kernels/src/array_shape.cc | 7 +++++-- lib/kernels/src/legion_dim.cc | 9 +++++++++ .../include/local-execution/arg_ref.h | 17 ++++++++++++++-- .../include/local-execution/concrete_arg.h | 14 +++++++++++++ .../include/local-execution/runtime_arg_ref.h | 16 +++++++++++++++ .../task_arg_spec.variant.toml | 4 +++- .../include/local-execution/task_binding.h | 18 +++++++++++++++++ .../task_invocation.struct.toml | 4 +++- .../src/local_training_backing.cc | 4 ++-- lib/local-execution/src/ops/element_unary.cc | 6 ++++-- lib/local-execution/src/runtime_arg_ref.cc | 13 ++++++++++++ lib/local-execution/src/task_binding.cc | 13 ++++++++++++ .../test/src/test_local_cost_estimator.cc | 20 ++++++++++++------- lib/pcg/include/pcg/optimizer_attrs.h | 2 +- .../parallel_computation_graph_builder.h | 4 ++++ lib/pcg/src/pcg/optimizer_attrs.cc | 3 ++- .../parallel_computation_graph_builder.cc | 2 +- 20 files changed, 144 insertions(+), 25 deletions(-) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index e60f0cd9c1..fd66697793 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -14,10 +14,10 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(size_t *dims, size_t num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); - ArrayShape(LegionTensorDims const &); + explicit ArrayShape(size_t *dims, size_t num_dims); + explicit ArrayShape(TensorShape const &shape); + explicit ArrayShape(std::vector const &); + explicit ArrayShape(LegionTensorDims const &); /** * @brief Alias of ArrayShape::num_elements for compatibility with diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index e4dd9723b8..29c5e29a93 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -10,6 +10,9 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions); +std::optional legion_dim_from_ff_dim(std::optional, + int num_dimensions); + template using LegionOrdered = DimOrdered; diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..b57fbee257 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -14,7 +14,7 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 8464212290..31ee7b6001 100644 --- 
a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -61,8 +61,11 @@ ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const { ArrayShape ArrayShape::sub_shape(std::optional start, std::optional end) const { - return ArrayShape{legion_dims_from_ff_dims( - slice(ff_ordered_from_legion_ordered(this->dims), start, end))}; + std::optional legion_start = + legion_dim_from_ff_dim(start, num_dims()); + std::optional legion_end = + legion_dim_from_ff_dim(end, num_dims()); + return this->sub_shape(legion_start, legion_end); } ArrayShape ArrayShape::sub_shape(std::optional start, diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 9ef47d40ae..c190a02220 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -10,4 +10,13 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { return legion_dim_t(num_dimensions - ff_dim.value - 1); } +std::optional + legion_dim_from_ff_dim(std::optional ff_dim, int num_dimensions) { + if (ff_dim.has_value()) { + return legion_dim_from_ff_dim(ff_dim.value(), num_dimensions); + } else { + return std::nullopt; + } +} + } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h index 30326b0e84..30da405c13 100644 --- a/lib/local-execution/include/local-execution/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -60,6 +60,20 @@ struct ArgRefSpec { friend struct std::hash>; }; +template +std::string format_as(ArgRefSpec const &x) { + std::ostringstream oss; + oss << ""; + return oss.str(); +} + +template +std::ostream &operator<<(std::ostream &s, ArgRefSpec const &x) { + return (s << fmt::to_string(x)); +} + } // namespace FlexFlow namespace std { @@ -68,8 +82,7 @@ template struct hash<::FlexFlow::ArgRefSpec> { size_t operator()(::FlexFlow::ArgRefSpec const &s) const { size_t result = 0; - hash_combine(s.type_idx); - hash_combine(s.ref_type); + ::FlexFlow::hash_combine(result, s.type_idx); return result; } }; diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h index 3bc2714a71..ac5d97f3c4 100644 --- a/lib/local-execution/include/local-execution/concrete_arg.h +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -3,6 +3,7 @@ #include "fmt/format.h" #include "local-execution/serialization.h" +#include "utils/hash-utils.h" #include "utils/type_index.h" #include @@ -53,4 +54,17 @@ std::ostream &operator<<(std::ostream &, ConcreteArgSpec const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ConcreteArgSpec> { + size_t operator()(::FlexFlow::ConcreteArgSpec const &s) const { + size_t result = 0; + ::FlexFlow::hash_combine(result, s.get_type_index()); + return result; + } +}; + +} // namespace std + #endif diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h index 279d854a27..fd79e23126 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -5,6 +5,8 @@ #include "local-execution/config.h" #include "local-execution/device_specific.h" #include "local-execution/profiling.h" +#include "utils/fmt.h" +#include "utils/type_index.h" namespace FlexFlow { @@ -14,6 +16,8 @@ enum class RuntimeArgRefType { FF_ITERATION_CONFIG }; +std::string 
to_string(RuntimeArgRefType const &); + template using RuntimeArgRef = ArgRef; @@ -23,6 +27,18 @@ RuntimeArgRef profiling_settings(); RuntimeArgRef> ff_handle(); RuntimeArgRef iteration_config(); +// std::string format_as(RuntimeArgRefSpec const & x) { +// std::ostringstream oss; +// oss << ""; +// return oss.str(); +// } + +// std::ostream &operator<<(std::ostream & s, RuntimeArgRefSpec const & x) { +// return (s << fmt::to_string(x)); +// } + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml index a6df0c8a7d..271e3b73d6 100644 --- a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml +++ b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml @@ -1,7 +1,9 @@ namespace = "FlexFlow" name = "TaskArgSpec" features = [ - "eq" + "eq", + "fmt", + "hash" ] includes = [ diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index cbe210f438..96c96473e4 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -7,6 +7,7 @@ #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" #include "local-execution/tensor_guid_spec.dtg.h" +#include "utils/hash/unordered_map.h" namespace FlexFlow { @@ -53,6 +54,23 @@ struct TaskBinding { tie() const; }; +std::string format_as(TaskBinding const &x); +std::ostream &operator<<(std::ostream &s, TaskBinding const &x); + } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::TaskBinding> { + size_t operator()(::FlexFlow::TaskBinding const &s) const { + size_t result = 0; + hash_combine(result, s.get_tensor_bindings()); + hash_combine(result, s.get_arg_bindings()); + return result; + } +}; + +} // namespace std + #endif diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/local-execution/include/local-execution/task_invocation.struct.toml index abcaabda93..c9e1e22ba1 100644 --- a/lib/local-execution/include/local-execution/task_invocation.struct.toml +++ b/lib/local-execution/include/local-execution/task_invocation.struct.toml @@ -1,7 +1,9 @@ namespace = "FlexFlow" name = "TaskInvocation" features = [ - "eq" + "eq", + "fmt", + "hash" ] includes = [ diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 46a8f83709..b7631470b7 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -39,7 +39,6 @@ LocalTrainingBacking::LocalTrainingBacking( // allocate optimizer buffers if (attrs.has() && this->training_instance.has_value()) { - assert(this->optimizer_attrs.has_value()); TaskSignature sig = get_update_signature(this->optimizer_attrs.value()); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); @@ -179,7 +178,8 @@ void LocalTrainingBacking::execute_update() { } } - this->optimizer_attrs = next(this->optimizer_attrs.value()); + this->optimizer_attrs = + get_next_iteration_optimizer_attrs(this->optimizer_attrs.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index dbbfad10fb..ccb41d7461 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ 
b/lib/local-execution/src/ops/element_unary.cc
@@ -61,8 +61,10 @@ static DeviceSpecificDeviceStates
   ParallelTensorShape output_shape =
       throw_if_unexpected(get_output_shape(attrs, input_shape));
 
-  ElementUnaryPerDeviceState per_device_state = init_kernel(
-      get_piece_shape(input_shape), get_piece_shape(output_shape), attrs);
+  ElementUnaryPerDeviceState per_device_state =
+      init_kernel(ArrayShape{get_piece_shape(input_shape)},
+                  ArrayShape{get_piece_shape(output_shape)},
+                  attrs);
 
   return DeviceSpecificDeviceStates{
       DeviceSpecific<ElementUnaryPerDeviceState>::create(per_device_state)};
diff --git a/lib/local-execution/src/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc
index 56201a5c55..1f591b4d82 100644
--- a/lib/local-execution/src/runtime_arg_ref.cc
+++ b/lib/local-execution/src/runtime_arg_ref.cc
@@ -3,6 +3,19 @@
 
 namespace FlexFlow {
 
+std::string to_string(RuntimeArgRefType const &runtime_arg_ref_type) {
+  switch (runtime_arg_ref_type) {
+    case RuntimeArgRefType::FF_HANDLE:
+      return "FF_HANDLE";
+    case RuntimeArgRefType::PROFILING_SETTINGS:
+      return "PROFILING_SETTINGS";
+    case RuntimeArgRefType::FF_ITERATION_CONFIG:
+      return "FF_ITERATION_CONFIG";
+    default:
+      return "Unknown";
+  }
+}
+
 RuntimeArgRef<ProfilingSettings> profiling_settings() {
   return {RuntimeArgRefType::PROFILING_SETTINGS};
 }
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
index a5a3b2dc34..45d9d0cdb9 100644
--- a/lib/local-execution/src/task_binding.cc
+++ b/lib/local-execution/src/task_binding.cc
@@ -1,5 +1,6 @@
 #include "local-execution/task_binding.h"
 #include "utils/containers/contains_key.h"
+#include "utils/fmt/unordered_map.h"
 
 namespace FlexFlow {
 
@@ -41,4 +42,16 @@ std::unordered_map<slot_id_t, TaskArgSpec> const &
   return this->arg_bindings;
 }
 
+std::string format_as(TaskBinding const &x) {
+  std::ostringstream oss;
+  oss << "<TaskBinding";
+  oss << " tensor_bindings=" << x.get_tensor_bindings();
+  oss << " arg_bindings=" << x.get_arg_bindings();
+  oss << ">";
+  return oss.str();
+}
+
+std::ostream &operator<<(std::ostream &s, TaskBinding const &x) {
+  return (s << fmt::to_string(x));
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ b/lib/local-execution/test/src/test_local_cost_estimator.cc
@@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
+    std::vector<ParallelTensorAttrs> weights;
     ParallelTensorShape weights_shape = throw_if_unexpected(
         get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape));
-    ParallelTensorAttrs weight_attrs =
-        ParallelTensorAttrs{weights_shape,
-                            /*sync_type=*/std::nullopt,
-                            /*initializer=*/std::nullopt,
-                            CreateGrad::YES};
+    weights.push_back(make_weight_attrs(weights_shape, std::nullopt));
+    ParallelTensorShape input_bias_shape =
+        throw_if_unexpected(get_input_bias_shape(
+            attrs, inputs_shape, inputs_shape, inputs_shape));
+    weights.push_back(make_weight_attrs(input_bias_shape, std::nullopt));
+    ParallelTensorShape output_bias_shape =
+        throw_if_unexpected(get_output_bias_shape(
+            attrs, inputs_shape, inputs_shape, inputs_shape));
+    weights.push_back(make_weight_attrs(output_bias_shape, std::nullopt));
 
     ParallelTensorShape output_shape = throw_if_unexpected(
         get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape));
 
@@ -66,7 +72,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             PCGOperatorAttrs{attrs},
             std::vector<ParallelTensorShape>{
                 inputs_shape, inputs_shape, inputs_shape},
-            std::vector<ParallelTensorAttrs>{weight_attrs},
+            weights,
             std::vector<ParallelTensorAttrs>{output_attrs},
             make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1}));
 
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
index 4b78f66fe4..d4abd1b52f 100644
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ b/lib/pcg/include/pcg/optimizer_attrs.h
@@ -6,7 +6,7 @@
 
 namespace FlexFlow {
 
-OptimizerAttrs next(OptimizerAttrs const &old);
+OptimizerAttrs get_next_iteration_optimizer_attrs(OptimizerAttrs const &old);
 
 } // namespace FlexFlow
 
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h
b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 019b120936..35113553f2 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -179,6 +179,10 @@ struct ParallelComputationGraphBuilder { ParallelComputationGraph pcg; }; +ParallelTensorAttrs + make_weight_attrs(ParallelTensorShape const &shape, + std::optional const &initializer_attrs); + } // namespace FlexFlow #endif diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index 5307450a68..8d66f7af7e 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -2,7 +2,8 @@ namespace FlexFlow { -OptimizerAttrs next(OptimizerAttrs const &old_attrs) { +OptimizerAttrs + get_next_iteration_optimizer_attrs(OptimizerAttrs const &old_attrs) { if (old_attrs.has()) { AdamOptimizerAttrs old = old_attrs.get(); double new_beta1_t = old.beta_t * old.beta1; diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index ce00ea62f4..b56156fe8a 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -20,7 +20,7 @@ static std::string get_default_name(PCGOperatorAttrs const &attrs) { return get_default_name(get_op_type(attrs)); } -static ParallelTensorAttrs make_weight_attrs( +ParallelTensorAttrs make_weight_attrs( ParallelTensorShape const &shape, std::optional const &initializer_attrs) { return ParallelTensorAttrs{ From 79ef4c964fa4abebf9813166353ecce230b83c75 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 22 Oct 2024 08:55:37 -0700 Subject: [PATCH 20/91] Refactor training backing and instance --- .../local-execution/local_slots_backing.h | 13 +- .../local-execution/local_training_backing.h | 26 +- .../local-execution/model_training_instance.h | 39 +++ .../model_training_instance.struct.toml | 25 -- .../include/local-execution/task_registry.h | 5 + .../src/local_cost_estimator.cc | 39 ++- .../src/local_slots_backing.cc | 64 +++-- .../src/local_training_backing.cc | 224 ++++++++---------- .../src/model_training_instance.cc | 64 +++++ lib/local-execution/src/task_registry.cc | 24 +- .../test/src/test_local_slots_backing.cc | 32 ++- lib/local-execution/test/src/test_loss_e2e.cc | 96 +++----- .../test/src/test_update_e2e.cc | 77 ++---- .../include/pcg/computation_graph_builder.h | 7 + lib/pcg/src/pcg/computation_graph_builder.cc | 14 +- 15 files changed, 402 insertions(+), 347 deletions(-) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.h delete mode 100644 lib/local-execution/include/local-execution/model_training_instance.struct.toml create mode 100644 lib/local-execution/src/model_training_instance.cc diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index d201d3c405..46e66e97a2 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -9,6 +9,7 @@ #include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" 
#include "local-execution/unified_tensor_guid.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" @@ -27,9 +28,15 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void allocate_outgoing_tensors(layer_guid_t const &, - ComputationGraph const &, - Allocator &); + void insert_into_tensor_mapping(tensor_guid_t const &, + GenericTensorAccessorW const &); + void allocate_layer_tensors(layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_tensors_by_role(TensorRole const &, + layer_guid_t const &, + ComputationGraph const &, + Allocator &); void allocate_optimizer_tensors(layer_guid_t const &weight_layer, tensor_guid_t const &, ComputationGraph const &, diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 2313d55732..6dfa8ad443 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H #include "local-execution/local_slots_backing.h" -#include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" @@ -16,19 +16,25 @@ struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, TensorBackingMap const &, - RuntimeArgConfig const &, - std::optional const &, - std::optional const &); - - void execute_init(); - PerLayerElapsedTime execute_forward(); - PerLayerElapsedTime execute_backward(); - void execute_update(); + RuntimeArgConfig const &); + void register_and_allocate_layer(layer_guid_t const &); + void allocate_layer_optimizer_tensors(layer_guid_t const &, + OptimizerAttrs const &); + + void execute_init(layer_guid_t const &); + std::optional execute_forward(layer_guid_t const &); + void compute_loss(LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + tensor_guid_t const &label_tensor); + std::optional execute_backward(layer_guid_t const &); + void execute_update(layer_guid_t const &, OptimizerAttrs const &); TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; + void insert_tensor(tensor_guid_t const &, GenericTensorAccessorW const &); + private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, TaskArgumentAccessor const &); @@ -39,8 +45,6 @@ struct LocalTrainingBacking { ComputationGraph computation_graph; TaskRegistry task_registry; LocalSlotsBacking local_slots_backing; - std::optional training_instance; - std::optional optimizer_attrs; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h new file mode 100644 index 0000000000..08f373a16f --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -0,0 +1,39 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H + +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" + +namespace FlexFlow { + +using 
PerLayerElapsedTime = + std::unordered_map>; + +struct ModelTrainingInstance { + ModelTrainingInstance(Allocator const &, + ComputationGraph const &, + TensorBackingMap const &, + RuntimeArgConfig const &, + LossAttrs const &, + tensor_guid_t const & logit_tensor, + tensor_guid_t const & label_tensor, + OptimizerAttrs const &); + + void register_and_allocate_layers(); + void allocate_optimizer_tensors(); + void execute_init(); + PerLayerElapsedTime execute_forward(); + PerLayerElapsedTime execute_backward(); + void execute_update(); + + ComputationGraph computation_graph; + LocalTrainingBacking training_backing; + LossAttrs loss_attrs; + tensor_guid_t logit_tensor; + tensor_guid_t label_tensor; + OptimizerAttrs optimizer_attrs; +}; + +} + +#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml deleted file mode 100644 index dcfaf2175d..0000000000 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ /dev/null @@ -1,25 +0,0 @@ -namespace = "FlexFlow" -name = "ModelTrainingInstance" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "op-attrs/ops/loss_functions/loss_attrs.dtg.h", - "pcg/tensor_guid_t.dtg.h" -] - -[[fields]] -name = "loss_attrs" -type = "::FlexFlow::LossAttrs" - -[[fields]] -name = "label_tensor" -type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "logit_tensor" -type = "::FlexFlow::tensor_guid_t" diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index e00cc183da..24790a28e3 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -2,6 +2,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H +#include "local-execution/op_task_type.dtg.h" #include "local-execution/task_registry.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" @@ -13,6 +14,10 @@ void register_tasks_for_layer(TaskRegistry &, layer_guid_t const &, ComputationGraphOpAttrs const &attrs); +bool registry_contains_op_task(TaskRegistry const &, + layer_guid_t const &, + OpTaskType const &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 186c2d516a..c99a2b154f 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -4,6 +4,7 @@ #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" +#include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" #include "utils/containers/transform.h" @@ -66,29 +67,27 @@ CostDetails LocalCostEstimator::estimate_cost( }; // add operator to graph - std::vector output_tensor_ids = - cg_builder.add_layer(layer_attrs, - input_tensor_ids, - transform(get_vector_piece_attrs(weights), - [&](TensorAttrs const &a) { - return cg_builder.create_weight(a); - }), - get_vector_piece_attrs(outputs)); - - std::optional model_training_instance = std::nullopt; - std::optional optimizer_attrs = std::nullopt; + LayerAddedResult layer_added_result = + cg_builder.add_layer_and_get_layer_added_result( + layer_attrs, + input_tensor_ids, + transform(get_vector_piece_attrs(weights), + 
+                    [&](TensorAttrs const &a) {
+                      return cg_builder.create_weight(a);
+                    }),
+          get_vector_piece_attrs(outputs));
+
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
                                      tensor_backing_map,
-                                     this->runtime_arg_config,
-                                     model_training_instance,
-                                     optimizer_attrs);
-
-  local_backing.execute_init();
-  PerLayerElapsedTime fwd = local_backing.execute_forward();
-  PerLayerElapsedTime bwd = local_backing.execute_backward();
-
-  return CostDetails{get_total_elapsed_time(fwd, bwd),
+                                     this->runtime_arg_config);
+  local_backing.register_and_allocate_layer(layer_added_result.layer);
+  local_backing.execute_init(layer_added_result.layer);
+  float fwd = local_backing.execute_forward(layer_added_result.layer).value();
+  float bwd = local_backing.execute_backward(layer_added_result.layer).value();
+  float total_execution_time = fwd + bwd;
+
+  return CostDetails{total_execution_time,
                      tracked_allocator_ptr->get_current_mem_usage()};
 }
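Note that the rewritten cost path calls `.value()` on the per-layer results, which throws `std::bad_optional_access` if the added layer has no registered forward or backward task. A defensive variant (hypothetical, not in this patch) would fall back to zero cost:

    float fwd = local_backing.execute_forward(layer_added_result.layer).value_or(0.0f);
    float bwd = local_backing.execute_backward(layer_added_result.layer).value_or(0.0f);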
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index f10b7c0126..25abc72567 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -18,39 +18,65 @@ void LocalSlotsBacking::add_per_device_op_state(
   this->per_device_op_states.insert({op_guid, device_state});
 }
 
-void LocalSlotsBacking::allocate_outgoing_tensors(
+void LocalSlotsBacking::insert_into_tensor_mapping(
+    tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) {
+  if (!contains_key(this->tensor_mapping, tensor)) {
+    this->tensor_mapping.insert({tensor, tensor_backing});
+  }
+}
+
+void LocalSlotsBacking::allocate_layer_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
     Allocator &allocator) {
-  std::vector<tensor_guid_t> incoming_input_tensors =
-      get_incoming_inputs(computation_graph, layer_guid);
-  std::vector<tensor_guid_t> incoming_weight_tensors =
-      get_incoming_weights(computation_graph, layer_guid);
-  std::vector<tensor_guid_t> outgoing_tensors =
-      get_outgoing_tensors(computation_graph, layer_guid);
-  for (tensor_guid_t const &output_tensor : outgoing_tensors) {
-    TensorAttrs tensor_attrs =
-        get_tensor_attrs(computation_graph, output_tensor);
+  this->allocate_tensors_by_role(
+      TensorRole::INPUT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::WEIGHT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::OUTPUT, layer_guid, computation_graph, allocator);
+}
+
+void LocalSlotsBacking::allocate_tensors_by_role(
+    TensorRole const &role,
+    layer_guid_t const &layer_guid,
+    ComputationGraph const &computation_graph,
+    Allocator &allocator) {
+  std::vector<tensor_guid_t> tensors;
+  switch (role) {
+    case TensorRole::INPUT:
+      tensors = get_incoming_inputs(computation_graph, layer_guid);
+      this->input_tensor_slots.insert({layer_guid, tensors});
+      break;
+    case TensorRole::WEIGHT:
+      tensors = get_incoming_weights(computation_graph, layer_guid);
+      this->weight_tensor_slots.insert({layer_guid, tensors});
+      break;
+    case TensorRole::OUTPUT:
+      tensors = get_outgoing_tensors(computation_graph, layer_guid);
+      this->output_tensor_slots.insert({layer_guid, tensors});
+      break;
+    default:
+      throw mk_runtime_error("Invalid tensor role, got {}", role);
+  }
+
+  for (tensor_guid_t const &tensor : tensors) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor);
     // tensor allocation
-    if (!is_tensor_allocated(output_tensor)) {
+    if (!is_tensor_allocated(tensor)) {
       GenericTensorAccessorW tensor_backing =
           allocator.allocate_tensor(tensor_attrs.shape);
-      this->tensor_mapping.insert({output_tensor, tensor_backing});
+      this->tensor_mapping.insert({tensor, tensor_backing});
     }
     // gradient tensor allocation
     if (tensor_attrs.create_gradients == CreateGrad::YES &&
-        !is_gradient_tensor_allocated(output_tensor)) {
+        !is_gradient_tensor_allocated(tensor)) {
       GenericTensorAccessorW gradient_tensor_backing =
           allocator.allocate_tensor(tensor_attrs.shape);
-      this->gradient_tensor_mapping.insert(
-          {output_tensor, gradient_tensor_backing});
+      this->gradient_tensor_mapping.insert({tensor, gradient_tensor_backing});
     }
   }
-
-  this->input_tensor_slots.insert({layer_guid, incoming_input_tensors});
-  this->weight_tensor_slots.insert({layer_guid, incoming_weight_tensors});
-  this->output_tensor_slots.insert({layer_guid, outgoing_tensors});
 }
 
 void LocalSlotsBacking::allocate_optimizer_tensors(
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index b7631470b7..0cb8146467 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -8,7 +8,6 @@
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
-#include "utils/containers/reversed.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -17,42 +16,30 @@ LocalTrainingBacking::LocalTrainingBacking(
     Allocator const &allocator,
     ComputationGraph const &computation_graph,
     TensorBackingMap const &tensor_backing_mapping,
-    RuntimeArgConfig const &runtime_arg_config,
-    std::optional<ModelTrainingInstance> const &training_instance,
-    std::optional<OptimizerAttrs> const &optimizer_attrs)
+    RuntimeArgConfig const &runtime_arg_config)
    : allocator(allocator), computation_graph(computation_graph),
      local_slots_backing(tensor_backing_mapping, runtime_arg_config),
-     task_registry(empty_task_registry()),
-     training_instance(training_instance), optimizer_attrs(optimizer_attrs) {
-
-  for (layer_guid_t const &node :
-       topological_ordering(this->computation_graph)) {
-    ComputationGraphOpAttrs attrs =
-        get_layer_attrs(this->computation_graph, node).attrs;
-
-    // allocate outgoing tensors
-    this->local_slots_backing.allocate_outgoing_tensors(
-        node, this->computation_graph, this->allocator);
-
-    // register tasks
-    register_tasks_for_layer(this->task_registry, node, attrs);
-
-    // allocate optimizer buffers
-    if (attrs.has<WeightAttrs>() && this->training_instance.has_value()) {
-      TaskSignature sig = get_update_signature(this->optimizer_attrs.value());
-      tensor_guid_t weight_tensor =
-          get_only(get_outgoing_tensors(this->computation_graph, node));
-      this->local_slots_backing.allocate_optimizer_tensors(
-          node, weight_tensor, this->computation_graph, this->allocator, sig);
-    }
-  }
+     task_registry(empty_task_registry()) {}
+
+void LocalTrainingBacking::register_and_allocate_layer(
+    layer_guid_t const &node) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  this->local_slots_backing.allocate_layer_tensors(
+      node, this->computation_graph, this->allocator);
+  register_tasks_for_layer(this->task_registry, node, attrs);
+}
 
-  if (this->training_instance.has_value()) {
-    // label and logit tensor should be allocated
-    assert(this->local_slots_backing.is_tensor_allocated(
-        this->training_instance.value().label_tensor));
-    assert(this->local_slots_backing.is_tensor_allocated(
-        this->training_instance.value().logit_tensor));
+void LocalTrainingBacking::allocate_layer_optimizer_tensors(
+    layer_guid_t const &node,
+    OptimizerAttrs const &optimizer_attrs) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  if (attrs.has<WeightAttrs>()) {
+    TaskSignature sig = get_update_signature(optimizer_attrs);
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    this->local_slots_backing.allocate_optimizer_tensors(
+        node, weight_tensor, this->computation_graph, this->allocator, sig);
   }
 }
 
@@ -76,110 +63,88 @@ std::optional<float>
   return fn(acc);
 }
 
-void LocalTrainingBacking::execute_init() {
-  for (layer_guid_t const &operator_node :
-       topological_ordering(this->computation_graph)) {
-    if (this->task_registry.init_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = init(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      DeviceSpecificDeviceStates device_state =
-          this->call_init_task_impl(invocation.task_id, accessor);
-      this->local_slots_backing.add_per_device_op_state(operator_node,
-                                                        device_state);
-    }
+void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::INIT)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = init(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    DeviceSpecificDeviceStates device_state =
+        this->call_init_task_impl(invocation.task_id, accessor);
+    this->local_slots_backing.add_per_device_op_state(operator_node,
+                                                      device_state);
   }
 }
 
-PerLayerElapsedTime LocalTrainingBacking::execute_forward() {
-  PerLayerElapsedTime per_op_elapsed_time;
-
-  for (layer_guid_t const &operator_node :
-       topological_ordering(this->computation_graph)) {
-    if (this->task_registry.forward_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = forward(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      std::optional<float> elapsed_time =
-          this->call_task_impl(invocation.task_id, accessor);
-      per_op_elapsed_time.insert({operator_node, elapsed_time});
-    }
+std::optional<float>
+    LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::FWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = forward(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
   }
-
-  return per_op_elapsed_time;
 }
 
-PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
-  PerLayerElapsedTime per_op_elapsed_time;
-
-  // compute loss
-  if (this->training_instance.has_value()) {
-    ModelTrainingInstance unwrapped_training_instance =
-        training_instance.value();
-    TaskInvocation loss_invocation =
-        backward(unwrapped_training_instance.loss_attrs,
-                 unwrapped_training_instance.logit_tensor,
-                 unwrapped_training_instance.label_tensor);
-    // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
-    TaskArgumentAccessor loss_accessor =
-        this->get_task_arg_accessor(loss_invocation);
-    TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
-    loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
-  }
+void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
+                                        tensor_guid_t const &logit_tensor,
+                                        tensor_guid_t const &label_tensor) {
+  assert(this->local_slots_backing.is_tensor_allocated(logit_tensor) &&
+         this->local_slots_backing.is_tensor_allocated(label_tensor));
+  TaskInvocation loss_invocation =
+      backward(loss_attrs, logit_tensor, label_tensor);
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor =
+      this->get_task_arg_accessor(loss_invocation);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+}
 
-  // backward through computation graph
-  for (layer_guid_t const &operator_node :
-       reversed(topological_ordering(this->computation_graph))) {
-    if (this->task_registry.backward_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = backward(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      std::optional<float> elapsed_time =
-          this->call_task_impl(invocation.task_id, accessor);
-      per_op_elapsed_time.insert({operator_node, elapsed_time});
-    }
+std::optional<float>
+    LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::BWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = backward(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
   }
-  return per_op_elapsed_time;
 }
 
-void LocalTrainingBacking::execute_update() {
-  assert(this->training_instance.has_value());
-  assert(this->optimizer_attrs.has_value());
-
-  for (layer_guid_t const &node :
-       topological_ordering(this->computation_graph)) {
-    LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
-    if (layer_attrs.attrs.has<WeightAttrs>()) {
-      // get tensors
-      tensor_guid_t weight_tensor =
-          get_only(get_outgoing_tensors(this->computation_graph, node));
-      std::vector<tensor_guid_t> grad_buffer_tensors =
-          this->local_slots_backing.weight_optimizer_tensor_guids.at(node);
-
-      // get invocation
-      TaskInvocation invocation = get_update_invocation(
-          this->optimizer_attrs.value(), weight_tensor, grad_buffer_tensors);
-      // assert(is_invocation_valid(get_update_signature(attrs), invocation));
-
-      // execute update
-      TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
-      TaskImplFunction update_impl_fn =
-          get_update_task_impl(this->optimizer_attrs.value());
-      update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
-    }
+void LocalTrainingBacking::execute_update(
+    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+  LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
+  if (layer_attrs.attrs.has<WeightAttrs>()) {
+    // get tensors
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    std::vector<tensor_guid_t> grad_buffer_tensors =
+        this->local_slots_backing.weight_optimizer_tensor_guids.at(node);
+
+    // get invocation
+    TaskInvocation invocation = get_update_invocation(
+        optimizer_attrs, weight_tensor, grad_buffer_tensors);
+    // assert(is_invocation_valid(get_update_signature(attrs), invocation));
+
+    // execute update
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
+    update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
   }
-
-  this->optimizer_attrs =
-      get_next_iteration_optimizer_attrs(this->optimizer_attrs.value());
 }
 
 TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
@@ -206,4 +171,9 @@ TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor(
       this->allocator, tensor_slots_backing, arg_slots_backing);
 }
 
+void LocalTrainingBacking::insert_tensor(
+    tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) {
+  this->local_slots_backing.insert_into_tensor_mapping(tensor, tensor_backing);
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
new file mode 100644
index 0000000000..7256a82478
--- /dev/null
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -0,0 +1,64 @@
+#include "local-execution/model_training_instance.h"
+#include "pcg/computation_graph.h"
+#include "utils/containers/reversed.h"
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator,
+                                             ComputationGraph const & computation_graph,
+                                             TensorBackingMap const & tensor_backing_map,
+                                             RuntimeArgConfig const & runtime_arg_config,
+                                             LossAttrs const & loss_attrs,
+                                             tensor_guid_t const &logit_tensor,
+                                             tensor_guid_t const &label_tensor,
+                                             OptimizerAttrs const & optimizer_attrs)
+    : computation_graph(computation_graph),
+      training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config),
+      loss_attrs(loss_attrs), logit_tensor(logit_tensor),
+      label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {}
+
+void ModelTrainingInstance::register_and_allocate_layers() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.register_and_allocate_layer(node);
+  }
+}
+
+void ModelTrainingInstance::allocate_optimizer_tensors() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs);
+  }
+}
+
+void ModelTrainingInstance::execute_init() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_init(node);
+  }
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_forward() {
+  PerLayerElapsedTime per_layer_elapsed_time;
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    std::optional<float> elapsed_time = this->training_backing.execute_forward(node);
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_backward() {
+  this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, this->label_tensor);
+
+  PerLayerElapsedTime per_layer_elapsed_time;
+  for (layer_guid_t const & node: reversed(topological_ordering(this->computation_graph))) {
+    std::optional<float> elapsed_time = this->training_backing.execute_backward(node);
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+void ModelTrainingInstance::execute_update() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_update(node, this->optimizer_attrs);
+  }
+  this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs);
+}
+
+}
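Taken together, ModelTrainingInstance rebuilds the old whole-graph training flow on top of the per-layer backing. A usage sketch (hypothetical; assumes backed `logit`/`label` tensors and constructed `loss_attrs`/`optimizer_attrs`):

    ModelTrainingInstance instance(allocator, cg, tensor_backing_map,
                                   runtime_arg_config, loss_attrs,
                                   logit, label, optimizer_attrs);
    instance.register_and_allocate_layers();
    instance.allocate_optimizer_tensors();
    instance.execute_init();
    for (int iter = 0; iter < num_iters; iter++) {
      PerLayerElapsedTime fwd = instance.execute_forward();
      PerLayerElapsedTime bwd = instance.execute_backward(); // runs compute_loss first
      instance.execute_update(); // also advances optimizer_attrs
    }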
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
index dad5c1fc69..3cd2cccae8 100644
--- a/lib/local-execution/src/task_registry.cc
+++ b/lib/local-execution/src/task_registry.cc
@@ -35,10 +35,32 @@ void register_tasks_for_layer(TaskRegistry &task_registry,
         task_registry.backward_task_ids[op_id] = task_id;
         break;
       default:
-        throw mk_runtime_error("Invalid OpTaskType");
+        throw mk_runtime_error("Invalid OpTaskType, got {}",
+                               task_signature_impl.task_signature.type);
     }
     task_registry.task_mapping.insert({task_id, task_signature_impl});
   }
 }
 
+bool registry_contains_op_task(TaskRegistry const &task_registry,
+                               layer_guid_t const &op,
+                               OpTaskType const &op_task_type) {
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> task_ids;
+  switch (op_task_type) {
+    case OpTaskType::INIT:
+      task_ids = task_registry.init_task_ids;
+      break;
+    case OpTaskType::FWD:
+      task_ids = task_registry.forward_task_ids;
+      break;
+    case OpTaskType::BWD:
+      task_ids = task_registry.backward_task_ids;
+      break;
+    default:
+      throw mk_runtime_error("Invalid OpTaskType, got {}", op_task_type);
+  }
+
+  return task_ids.at(op).has_value();
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc
index 779ba43f26..5d58e7e757 100644
--- a/lib/local-execution/test/src/test_local_slots_backing.cc
+++ b/lib/local-execution/test/src/test_local_slots_backing.cc
@@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     LocalSlotsBacking local_slots_backing = {tensor_backing_map,
                                              runtime_arg_config};
 
-    SUBCASE("LocalSlotsBacking::allocate_outgoing_tensors") {
+    SUBCASE("LocalSlotsBacking::allocate_tensors_by_role") {
       auto get_result_shape_and_dtype_for_tensor_guid_and_map =
           [&](tensor_guid_t t,
               TensorBackingMap m) -> std::pair<ArrayShape, DataType> {
@@ -92,14 +92,11 @@ TEST_SUITE(FF_TEST_SUITE) {
 
       SUBCASE("Input (QKV) and gradient tensors allocation") {
         // allocate all tensors from input nodes
-        for (layer_guid_t const &node :
-             topological_ordering(cg_builder.computation_graph)) {
-          if (node == layer_guid) {
-            break;
-          }
-          local_slots_backing.allocate_outgoing_tensors(
-              node, cg_builder.computation_graph, allocator);
-        }
+        local_slots_backing.allocate_tensors_by_role(
+            TensorRole::INPUT,
+            layer_guid,
+            cg_builder.computation_graph,
+            allocator);
 
         SUBCASE("Query grad") {
           std::pair<ArrayShape, DataType> result =
@@ -127,8 +124,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         }
       }
       SUBCASE("Output and gradient tensors allocation") {
-        local_slots_backing.allocate_outgoing_tensors(
-            layer_guid, cg_builder.computation_graph, allocator);
+        local_slots_backing.allocate_tensors_by_role(
+            TensorRole::OUTPUT,
+            layer_guid,
+            cg_builder.computation_graph,
+            allocator);
         SUBCASE("Output") {
           std::pair<ArrayShape, DataType> result =
               get_result_shape_and_dtype_for_tensor_guid_and_map(
@@ -154,7 +154,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
 
     SUBCASE("Tensor slots") {
-      local_slots_backing.allocate_outgoing_tensors(
+      local_slots_backing.allocate_layer_tensors(
           layer_guid, cg_builder.computation_graph, allocator);
       SUBCASE("Input tensor slots") {
         std::vector<tensor_guid_t> correct_incoming_input_tensors =
@@ -211,12 +211,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       return b;
     }();
 
-    // allocate all incoming and outgoing tensors for graph
-    for (layer_guid_t const &node :
-         topological_ordering(cg_builder.computation_graph)) {
-      local_slots_backing.allocate_outgoing_tensors(
-          node, cg_builder.computation_graph, allocator);
-    }
+    local_slots_backing.allocate_layer_tensors(
+        layer_guid, cg_builder.computation_graph, allocator);
 
     SUBCASE("LocalSlotsBacking::construct_tensor_slots_backing") {
       TensorSlotsBackingWithoutAddresses result =
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 37024adc26..c4662d624c 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -3,6 +3,8 @@
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
@@ -19,12 +21,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::YES,
         ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.0,
-                                         /*momentum=*/0.0,
-                                         /*nesterov=*/false,
-                                         /*weight_decay=*/0.0}};
-
     // construct graph
     ComputationGraphBuilder cg_builder;
 
@@ -36,8 +32,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
+    std::string layer_name = "scalar multiply";
    tensor_guid_t logit_tensor =
-        cg_builder.scalar_multiply(input_tensor, scalar);
+        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
 
     // allocate memory
     Allocator allocator = create_local_cuda_memory_allocator();
@@ -46,6 +43,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({input_tensor, input_backing});
 
+    LocalTrainingBacking local_backing(allocator,
+                                       cg_builder.computation_graph,
+                                       tensor_backing_map,
+                                       runtime_arg_config);
+    // for (layer_guid_t const & node:
+    // topological_ordering(cg_builder.computation_graph)) {
+    //   local_backing.register_and_allocate_layer(node);
+    // }
+    local_backing.register_and_allocate_layer(
+        get_layer_by_name(cg_builder.computation_graph, layer_name));
+
     SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
       TensorShape label_shape = TensorShape{
          TensorDims{FFOrdered<size_t>{batch_size, 1}}, DataType::FLOAT};
@@ -53,22 +61,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           cg_builder.create_input(label_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(label_shape);
-      tensor_backing_map.insert({label_tensor, label_backing});
-      std::optional<ModelTrainingInstance> model_training_instance =
-          ModelTrainingInstance{
-              LossAttrs{SparseCategoricalCrossEntropyLossAttrs{
-                  /*replace_labels=*/false}},
-              label_tensor,
-              logit_tensor};
-      LocalTrainingBacking local_backing(allocator,
-                                         cg_builder.computation_graph,
-                                         tensor_backing_map,
-                                         runtime_arg_config,
-                                         model_training_instance,
-                                         optimizer_attrs);
-      local_backing.execute_init();
-      local_backing.execute_forward();
-      local_backing.execute_backward();
+      local_backing.insert_tensor(label_tensor, label_backing);
+      LossAttrs loss_attrs = LossAttrs{
+          SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}};
+      local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
     }
 
     SUBCASE("NonconfigurableLossAttrs") {
@@ -76,58 +72,24 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           cg_builder.create_input(input_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
           allocator.allocate_tensor(input_shape);
-      tensor_backing_map.insert({label_tensor, label_backing});
+      local_backing.insert_tensor(label_tensor, label_backing);
 
       SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{
-                                      LossFunction::CATEGORICAL_CROSSENTROPY}},
-                                  label_tensor,
-                                  logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs = LossAttrs{
+            NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
       }
 
       SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{
+            LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
       }
 
       SUBCASE("LossFunction::IDENTITY") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs =
+            LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
       }
     }
   }
 }
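Both loss tests register only the single named layer; the commented-out loop is the whole-graph alternative. If that pattern recurs across tests, a helper along these lines could factor it out (hypothetical, not part of this patch):

    void register_and_allocate_all_layers(LocalTrainingBacking &backing,
                                          ComputationGraph const &cg) {
      for (layer_guid_t const &node : topological_ordering(cg)) {
        backing.register_and_allocate_layer(node);
      }
    }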
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index 96b748806f..b48214d89d 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -3,6 +3,7 @@
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
@@ -30,8 +31,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
+    std::string layer_name = "scalar_multiply";
     tensor_guid_t logit_tensor =
-        cg_builder.scalar_multiply(input_tensor, scalar);
+        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
 
     // allocate memory
     Allocator allocator = create_local_cuda_memory_allocator();
@@ -40,11 +42,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({input_tensor, input_backing});
 
-    tensor_guid_t label_tensor =
-        cg_builder.create_input(input_shape, CreateGrad::NO);
-    GenericTensorAccessorW label_backing =
-        allocator.allocate_tensor(input_shape);
-    tensor_backing_map.insert({label_tensor, label_backing});
+    LocalTrainingBacking local_backing(allocator,
+                                       cg_builder.computation_graph,
+                                       tensor_backing_map,
+                                       runtime_arg_config);
+    // for (layer_guid_t const & node:
+    // topological_ordering(cg_builder.computation_graph)) {
+    //   local_backing.register_and_allocate_layer(node);
+    // }
+    layer_guid_t layer_guid =
+        get_layer_by_name(cg_builder.computation_graph, layer_name);
+    local_backing.register_and_allocate_layer(layer_guid);
 
     SUBCASE("SGDOptimizerAttrs") {
       SUBCASE("momentum=0") {
@@ -53,22 +61,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                          /*momentum=*/0.0f,
                                          /*nesterov=*/false,
                                          /*weight_decay=*/0.001}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
       }
       SUBCASE("momentum=0.9") {
         OptimizerAttrs optimizer_attrs =
@@ -76,22 +71,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                              /*momentum=*/0.9,
                                              /*nesterov=*/false,
                                              /*weight_decay=*/0.001}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
       }
     }
     SUBCASE("AdamOptimizerAttrs") {
@@ -104,22 +86,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                           /*beta_t=*/0.9,
                                           /*beta2_t=*/0.999,
                                           /*epsilon=*/1e-8}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
       }
     }
   }
 }
diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h
index 45cde0de57..585399ea1d 100644
--- a/lib/pcg/include/pcg/computation_graph_builder.h
+++ b/lib/pcg/include/pcg/computation_graph_builder.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_PCG_INCLUDE_PCG_COMPUTATION_GRAPH_BUILDER_H
 
 #include "pcg/computation_graph.dtg.h"
+#include "pcg/computation_graph/layer_added_result.dtg.h"
 #include "pcg/initializer_attrs.dtg.h"
 #include "pcg/tensor_guid_t.dtg.h"
 
@@ -256,6 +257,12 @@ struct ComputationGraphBuilder {
   std::vector<tensor_guid_t> get_outputs(LayerAttrs const &) const;
   tensor_guid_t get_output(LayerAttrs const &, int idx) const;
 
+  LayerAddedResult add_layer_and_get_layer_added_result(
+      LayerAttrs const &layer,
+      std::vector<tensor_guid_t> const &inputs,
+      std::vector<tensor_guid_t> const &weights,
+      std::vector<TensorAttrs> const &outputs);
+
   std::vector<tensor_guid_t> add_layer(LayerAttrs const &layer,
                                        std::vector<tensor_guid_t> const &inputs,
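`add_layer_and_get_layer_added_result` exists for callers that need the new layer's guid as well as its outputs, e.g. the cost estimator's per-layer calls above. A usage sketch (hypothetical):

    LayerAddedResult added = cg_builder.add_layer_and_get_layer_added_result(
        layer_attrs, inputs, weights, outputs);
    layer_guid_t layer = added.layer;    // usable with the per-layer backing API
    std::vector<tensor_guid_t> outs = added.outputs; // what add_layer returns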
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 4a565476bd..4c619288cb 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -106,7 +106,7 @@ static void check_incoming_tensor_roles(LayerAttrs const &layer,
   }
 }
 
-std::vector<tensor_guid_t> ComputationGraphBuilder::add_layer(
+LayerAddedResult ComputationGraphBuilder::add_layer_and_get_layer_added_result(
     LayerAttrs const &layer,
     std::vector<tensor_guid_t> const &inputs,
     std::vector<tensor_guid_t> const &weights,
     std::vector<TensorAttrs> const &outputs) {
@@ -115,7 +115,17 @@
   LayerAddedResult added = ::FlexFlow::add_layer(
       this->computation_graph, layer, concat_vectors(inputs, weights), outputs);
 
-  return added.outputs;
+  return added;
+}
+
+std::vector<tensor_guid_t> ComputationGraphBuilder::add_layer(
+    LayerAttrs const &layer,
+    std::vector<tensor_guid_t> const &inputs,
+    std::vector<tensor_guid_t> const &weights,
+    std::vector<TensorAttrs> const &outputs) {
+  return this
+      ->add_layer_and_get_layer_added_result(layer, inputs, weights, outputs)
+      .outputs;
 }
 
 tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x,

From a73b1c325f819f1ffdcdc0ce38fda1e25fd2eb28 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Wed, 13 Nov 2024 09:22:43 -0800
Subject: [PATCH 21/91] Expose op folders publicly

---
 .../local-execution/model_training_instance.h |  2 -
 .../local-execution}/ops/attention.h          |  0
 .../local-execution}/ops/batch_matmul.h       |  0
 .../local-execution}/ops/batch_norm.h         |  0
 .../local-execution}/ops/cast.h               |  0
 .../local-execution}/ops/combine.h            |  0
 .../local-execution}/ops/concat.h             |  0
 .../local-execution}/ops/conv_2d.h            |  0
 .../local-execution}/ops/dropout.h            |  0
 .../local-execution}/ops/element_binary.h     |  0
 .../local-execution}/ops/element_unary.h      |  0
 .../local-execution}/ops/embedding.h          |  0
 .../local-execution}/ops/flat.h               |  0
 .../local-execution}/ops/gather.h             |  0
 .../local-execution}/ops/input.h              |  0
 .../local-execution}/ops/layer_norm.h         |  0
 .../local-execution}/ops/linear.h             |  0
 .../local-execution}/ops/noop.h               |  0
 .../local-execution}/ops/parallel_op.h        |  0
 .../local-execution}/ops/pool_2d.h            |  0
 .../local-execution}/ops/reduce.h             |  0
 .../local-execution}/ops/reduction.h          |  0
 .../local-execution}/ops/repartition.h        |  0
 .../local-execution}/ops/replicate.h          |  0
 .../local-execution}/ops/reshape.h            |  0
 .../local-execution}/ops/reverse.h            |  0
 .../local-execution}/ops/softmax.h            |  0
 .../local-execution}/ops/split.h              |  0
 .../local-execution}/ops/topk.h               |  0
 .../local-execution}/ops/transpose.h          |  0
 .../local-execution}/ops/weight.h             |  0
 .../src/model_training_instance.cc            | 11 +---
 lib/local-execution/src/ops/attention.cc      |  2 +-
 lib/local-execution/src/ops/batch_matmul.cc   |  2 +-
 lib/local-execution/src/ops/batch_norm.cc     |  2 +-
 lib/local-execution/src/ops/cast.cc           |  2 +-
 lib/local-execution/src/ops/combine.cc        |  2 +-
 lib/local-execution/src/ops/concat.cc         |  2 +-
 lib/local-execution/src/ops/conv_2d.cc        |  2 +-
 lib/local-execution/src/ops/dropout.cc        |  2 +-
 lib/local-execution/src/ops/element_binary.cc |  2 +-
 lib/local-execution/src/ops/element_unary.cc  |  2 +-
 lib/local-execution/src/ops/flat.cc           |  2 +-
 lib/local-execution/src/ops/gather.cc         |  2 +-
 lib/local-execution/src/ops/input.cc          |  2 +-
 lib/local-execution/src/ops/layer_norm.cc     |  2 +-
 lib/local-execution/src/ops/linear.cc         |  2 +-
 lib/local-execution/src/ops/noop.cc           |  2 +-
 lib/local-execution/src/ops/pool_2d.cc        |  2 +-
 lib/local-execution/src/ops/reduce.cc         |  2 +-
 lib/local-execution/src/ops/reduction.cc      |  2 +-
 lib/local-execution/src/ops/repartition.cc    |  2 +-
 lib/local-execution/src/ops/replicate.cc      |  2 +-
 lib/local-execution/src/ops/reshape.cc        |  2 +-
 lib/local-execution/src/ops/reverse.cc        |  2 +-
 lib/local-execution/src/ops/softmax.cc        |  2 +-
 lib/local-execution/src/ops/split.cc          |  2 +-
 lib/local-execution/src/ops/topk.cc           |  2 +-
 lib/local-execution/src/ops/transpose.cc      |  2 +-
 lib/local-execution/src/ops/weight.cc         |  2 +-
 .../src/task_signature_impl.cc                | 58 +++++++++----------
 .../include/op-attrs/operator_attrs.h         | 58 +++++++++----------
 62 files changed, 89 insertions(+), 96 deletions(-)
 rename lib/local-execution/{src => include/local-execution}/ops/attention.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/batch_matmul.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/batch_norm.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/cast.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/combine.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/concat.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/conv_2d.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/dropout.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/element_binary.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/element_unary.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/embedding.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/flat.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/gather.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/input.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/layer_norm.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/linear.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/noop.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/parallel_op.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/pool_2d.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reduce.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reduction.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/repartition.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/replicate.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reshape.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reverse.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/softmax.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/split.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/topk.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/transpose.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/weight.h (100%)

diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h
index 08f373a16f..14473ff26e 100644
--- a/lib/local-execution/include/local-execution/model_training_instance.h
+++ b/lib/local-execution/include/local-execution/model_training_instance.h
@@ -19,8 +19,6 @@ struct ModelTrainingInstance {
                         tensor_guid_t const & label_tensor,
                         OptimizerAttrs const &);
 
-  void register_and_allocate_layers();
-  void allocate_optimizer_tensors();
   void execute_init();
   PerLayerElapsedTime execute_forward();
   PerLayerElapsedTime execute_backward();
diff --git a/lib/local-execution/src/ops/attention.h b/lib/local-execution/include/local-execution/ops/attention.h
similarity index 100%
rename from lib/local-execution/src/ops/attention.h
rename to lib/local-execution/include/local-execution/ops/attention.h
diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/include/local-execution/ops/batch_matmul.h
similarity index 100%
rename from lib/local-execution/src/ops/batch_matmul.h
rename to lib/local-execution/include/local-execution/ops/batch_matmul.h
diff --git a/lib/local-execution/src/ops/batch_norm.h b/lib/local-execution/include/local-execution/ops/batch_norm.h
similarity index 100%
rename from lib/local-execution/src/ops/batch_norm.h
rename to lib/local-execution/include/local-execution/ops/batch_norm.h
diff --git a/lib/local-execution/src/ops/cast.h b/lib/local-execution/include/local-execution/ops/cast.h
similarity index 100%
rename from lib/local-execution/src/ops/cast.h
rename to lib/local-execution/include/local-execution/ops/cast.h
diff --git a/lib/local-execution/src/ops/combine.h b/lib/local-execution/include/local-execution/ops/combine.h
similarity index 100%
rename from lib/local-execution/src/ops/combine.h
rename to lib/local-execution/include/local-execution/ops/combine.h
diff --git a/lib/local-execution/src/ops/concat.h b/lib/local-execution/include/local-execution/ops/concat.h
similarity index 100%
rename from lib/local-execution/src/ops/concat.h
rename to lib/local-execution/include/local-execution/ops/concat.h
diff --git a/lib/local-execution/src/ops/conv_2d.h b/lib/local-execution/include/local-execution/ops/conv_2d.h
similarity index 100%
rename from lib/local-execution/src/ops/conv_2d.h
rename to lib/local-execution/include/local-execution/ops/conv_2d.h
diff --git a/lib/local-execution/src/ops/dropout.h b/lib/local-execution/include/local-execution/ops/dropout.h
similarity index 100%
rename from lib/local-execution/src/ops/dropout.h
rename to lib/local-execution/include/local-execution/ops/dropout.h
diff --git a/lib/local-execution/src/ops/element_binary.h b/lib/local-execution/include/local-execution/ops/element_binary.h
similarity index 100%
rename from lib/local-execution/src/ops/element_binary.h
rename to lib/local-execution/include/local-execution/ops/element_binary.h
diff --git a/lib/local-execution/src/ops/element_unary.h b/lib/local-execution/include/local-execution/ops/element_unary.h
similarity index 100%
rename from lib/local-execution/src/ops/element_unary.h
rename to lib/local-execution/include/local-execution/ops/element_unary.h
diff --git a/lib/local-execution/src/ops/embedding.h b/lib/local-execution/include/local-execution/ops/embedding.h
similarity index 100%
rename from lib/local-execution/src/ops/embedding.h
rename to lib/local-execution/include/local-execution/ops/embedding.h
diff --git a/lib/local-execution/src/ops/flat.h b/lib/local-execution/include/local-execution/ops/flat.h
similarity index 100%
rename from lib/local-execution/src/ops/flat.h
rename to lib/local-execution/include/local-execution/ops/flat.h
diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/include/local-execution/ops/gather.h
similarity index 100%
rename from lib/local-execution/src/ops/gather.h
rename to lib/local-execution/include/local-execution/ops/gather.h
diff --git a/lib/local-execution/src/ops/input.h b/lib/local-execution/include/local-execution/ops/input.h
similarity index 100%
rename from lib/local-execution/src/ops/input.h
rename to lib/local-execution/include/local-execution/ops/input.h
diff --git a/lib/local-execution/src/ops/layer_norm.h b/lib/local-execution/include/local-execution/ops/layer_norm.h
similarity index 100%
rename from lib/local-execution/src/ops/layer_norm.h
rename to lib/local-execution/include/local-execution/ops/layer_norm.h
diff --git a/lib/local-execution/src/ops/linear.h b/lib/local-execution/include/local-execution/ops/linear.h
similarity index 100%
rename from lib/local-execution/src/ops/linear.h
rename to lib/local-execution/include/local-execution/ops/linear.h
diff --git a/lib/local-execution/src/ops/noop.h b/lib/local-execution/include/local-execution/ops/noop.h
similarity index 100%
rename from lib/local-execution/src/ops/noop.h
rename to lib/local-execution/include/local-execution/ops/noop.h
diff --git a/lib/local-execution/src/ops/parallel_op.h b/lib/local-execution/include/local-execution/ops/parallel_op.h
similarity index 100%
rename from lib/local-execution/src/ops/parallel_op.h
rename to lib/local-execution/include/local-execution/ops/parallel_op.h
diff --git a/lib/local-execution/src/ops/pool_2d.h b/lib/local-execution/include/local-execution/ops/pool_2d.h
similarity index 100%
rename from lib/local-execution/src/ops/pool_2d.h
rename to lib/local-execution/include/local-execution/ops/pool_2d.h
diff --git a/lib/local-execution/src/ops/reduce.h b/lib/local-execution/include/local-execution/ops/reduce.h
similarity index 100%
rename from lib/local-execution/src/ops/reduce.h
rename to lib/local-execution/include/local-execution/ops/reduce.h
diff --git a/lib/local-execution/src/ops/reduction.h b/lib/local-execution/include/local-execution/ops/reduction.h
similarity index 100%
rename from lib/local-execution/src/ops/reduction.h
rename to lib/local-execution/include/local-execution/ops/reduction.h
diff --git a/lib/local-execution/src/ops/repartition.h b/lib/local-execution/include/local-execution/ops/repartition.h
similarity index 100%
rename from lib/local-execution/src/ops/repartition.h
rename to lib/local-execution/include/local-execution/ops/repartition.h
diff --git a/lib/local-execution/src/ops/replicate.h b/lib/local-execution/include/local-execution/ops/replicate.h
similarity index 100%
rename from lib/local-execution/src/ops/replicate.h
rename to lib/local-execution/include/local-execution/ops/replicate.h
diff --git a/lib/local-execution/src/ops/reshape.h b/lib/local-execution/include/local-execution/ops/reshape.h
similarity index 100%
rename from lib/local-execution/src/ops/reshape.h
rename to lib/local-execution/include/local-execution/ops/reshape.h
diff --git a/lib/local-execution/src/ops/reverse.h b/lib/local-execution/include/local-execution/ops/reverse.h
similarity index 100%
rename from lib/local-execution/src/ops/reverse.h
rename to lib/local-execution/include/local-execution/ops/reverse.h
diff --git a/lib/local-execution/src/ops/softmax.h b/lib/local-execution/include/local-execution/ops/softmax.h
similarity index 100%
rename from lib/local-execution/src/ops/softmax.h
rename to lib/local-execution/include/local-execution/ops/softmax.h
diff --git a/lib/local-execution/src/ops/split.h b/lib/local-execution/include/local-execution/ops/split.h
similarity index 100%
rename from lib/local-execution/src/ops/split.h
rename to lib/local-execution/include/local-execution/ops/split.h
diff --git a/lib/local-execution/src/ops/topk.h b/lib/local-execution/include/local-execution/ops/topk.h
similarity index 100%
rename from lib/local-execution/src/ops/topk.h
rename to lib/local-execution/include/local-execution/ops/topk.h
diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/include/local-execution/ops/transpose.h
similarity index 100%
rename from lib/local-execution/src/ops/transpose.h
rename to lib/local-execution/include/local-execution/ops/transpose.h
diff --git a/lib/local-execution/src/ops/weight.h b/lib/local-execution/include/local-execution/ops/weight.h
similarity index 100%
rename from lib/local-execution/src/ops/weight.h
rename to lib/local-execution/include/local-execution/ops/weight.h
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
index 7256a82478..abdced1bb5 100644
--- a/lib/local-execution/src/model_training_instance.cc
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -14,16 +14,11 @@ ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator,
                                              tensor_guid_t const &label_tensor,
                                              OptimizerAttrs const & optimizer_attrs)
     : computation_graph(computation_graph), training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config),
-      loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {}
+      loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {
 
-void ModelTrainingInstance::register_and_allocate_layers() {
+  // allocate each layer's tensors
   for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
     this->training_backing.register_and_allocate_layer(node);
-  }
-}
-
-void ModelTrainingInstance::allocate_optimizer_tensors() {
-  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
     this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs);
   }
 }
@@ -61,4 +56,4 @@ void ModelTrainingInstance::execute_update() {
   this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs);
 }
 
-}
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc
index 5e693d43db..b4c5d1ff8a 100644
--- a/lib/local-execution/src/ops/attention.cc
+++ b/lib/local-execution/src/ops/attention.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "attention.h"
+#include "local-execution/ops/attention.h"
 #include "kernels/attention_kernels.h"
 #include "local-execution/op_task_signature.h"
 #include "op-attrs/ops/attention.h"
diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc
index d60a003061..e358e0a645 100644
--- a/lib/local-execution/src/ops/batch_matmul.cc
+++ b/lib/local-execution/src/ops/batch_matmul.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "batch_matmul.h"
+#include "local-execution/ops/batch_matmul.h"
 #include "kernels/batch_matmul_kernels.h"
 #include "local-execution/op_task_signature.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc
index 254d7ef39e..62155aa161 100644
--- a/lib/local-execution/src/ops/batch_norm.cc
+++ b/lib/local-execution/src/ops/batch_norm.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "batch_norm.h"
+#include "local-execution/ops/batch_norm.h"
 #include "kernels/batch_norm_kernels.h"
 
 namespace FlexFlow {
diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc
index d3e43a46a0..846faa9262 100644
--- a/lib/local-execution/src/ops/cast.cc
+++ b/lib/local-execution/src/ops/cast.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "cast.h"
+#include "local-execution/ops/cast.h"
 #include "kernels/cast_kernels.h"
 #include "local-execution/op_task_signature.h"
diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc
index 92f2931344..b7e84878f4 100644
--- a/lib/local-execution/src/ops/combine.cc
+++ b/lib/local-execution/src/ops/combine.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "combine.h"
+#include "local-execution/ops/combine.h"
 #include "kernels/combine_kernels.h"
 #include "local-execution/op_task_invocation.h"
 #include "utils/hash-utils.h"
diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc
index 42d98c336a..dee1dd08e5 100644
--- a/lib/local-execution/src/ops/concat.cc
+++ b/lib/local-execution/src/ops/concat.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "concat.h"
+#include "local-execution/ops/concat.h"
 #include "kernels/concat_kernels.h"
 #include "local-execution/op_task_signature.h"
diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc
index 7694a03947..7ae92d70c7 100644
--- a/lib/local-execution/src/ops/conv_2d.cc
+++ b/lib/local-execution/src/ops/conv_2d.cc
@@ -1,4 +1,4 @@
-#include "conv_2d.h"
+#include "local-execution/ops/conv_2d.h"
 #include "kernels/conv_2d_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc
index 77a2963313..017d023ec4 100644
--- a/lib/local-execution/src/ops/dropout.cc
+++ b/lib/local-execution/src/ops/dropout.cc
@@ -1,4 +1,4 @@
-#include "dropout.h"
+#include "local-execution/ops/dropout.h"
 #include "kernels/dropout_kernels.h"
 #include "local-execution/op_task_invocation.h"
 #include "local-execution/op_task_signature.h"
diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc
index 2152b1beea..d4c12c7285 100644
--- a/lib/local-execution/src/ops/element_binary.cc
+++ b/lib/local-execution/src/ops/element_binary.cc
@@ -1,4 +1,4 @@
-#include "element_binary.h"
+#include "local-execution/ops/element_binary.h"
 #include "kernels/element_binary_kernels.h"
 #include "local-execution/task_signature_impl.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc
index ccb41d7461..85ecf3db23 100644
--- a/lib/local-execution/src/ops/element_unary.cc
+++ b/lib/local-execution/src/ops/element_unary.cc
@@ -1,4 +1,4 @@
-#include "element_unary.h"
+#include "local-execution/ops/element_unary.h"
 #include "kernels/element_unary_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "op-attrs/parallel_tensor_shape.h"
diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc
index 8df5703f60..ef4dc7ab68 100644
--- a/lib/local-execution/src/ops/flat.cc
+++ b/lib/local-execution/src/ops/flat.cc
@@ -1,4 +1,4 @@
-#include "flat.h"
+#include "local-execution/ops/flat.h"
 #include "kernels/flat_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc
index 558988f9a4..180026e9ba 100644
--- a/lib/local-execution/src/ops/gather.cc
+++ b/lib/local-execution/src/ops/gather.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "gather.h"
+#include "local-execution/ops/gather.h"
 #include "kernels/gather_kernels.h"
 #include "local-execution/legion_tensor_shape.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/input.cc b/lib/local-execution/src/ops/input.cc
index 56d19fa1ba..d7a3888220 100644
--- a/lib/local-execution/src/ops/input.cc
+++ b/lib/local-execution/src/ops/input.cc
@@ -1,4 +1,4 @@
-#include "input.h"
+#include "local-execution/ops/input.h"
 
 namespace FlexFlow {
diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc
index b1f44d69ae..c9e2a8d55e 100644
--- a/lib/local-execution/src/ops/layer_norm.cc
+++ b/lib/local-execution/src/ops/layer_norm.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "layer_norm.h"
+#include "local-execution/ops/layer_norm.h"
 #include "kernels/layer_norm_kernels.h"
 #include "local-execution/legion_tensor_shape.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc
index 9e29a0cce0..075aa1d9e4 100644
--- a/lib/local-execution/src/ops/linear.cc
+++ b/lib/local-execution/src/ops/linear.cc
@@ -1,4 +1,4 @@
-#include "linear.h"
+#include "local-execution/ops/linear.h"
 #include "kernels/linear_kernels.h"
 #include "local-execution/task_argument_accessor.h"
 #include "op-attrs/ff_dim.h"
diff --git a/lib/local-execution/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc
index e35fdec275..7357806880 100644
--- a/lib/local-execution/src/ops/noop.cc
+++ b/lib/local-execution/src/ops/noop.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "noop.h"
+#include "local-execution/ops/noop.h"
 
 namespace FlexFlow {
diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc
index 126f57be0d..66f27fa69f 100644
--- a/lib/local-execution/src/ops/pool_2d.cc
+++ b/lib/local-execution/src/ops/pool_2d.cc
@@ -1,4 +1,4 @@
-#include "pool_2d.h"
+#include "local-execution/ops/pool_2d.h"
 #include "kernels/pool_2d_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc
index 01d2f0e86f..c157a98b36 100644
--- a/lib/local-execution/src/ops/reduce.cc
+++ b/lib/local-execution/src/ops/reduce.cc
@@ -1,4 +1,4 @@
-#include "reduce.h"
+#include "local-execution/ops/reduce.h"
 #include "kernels/reduce_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc
index f946b7d146..95962661e2 100644
--- a/lib/local-execution/src/ops/reduction.cc
+++ b/lib/local-execution/src/ops/reduction.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "reduction.h"
+#include "local-execution/ops/reduction.h"
 #include "kernels/reduction_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc
index e260fd77f5..9bba8109f3 100644
--- a/lib/local-execution/src/ops/repartition.cc
+++ b/lib/local-execution/src/ops/repartition.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "repartition.h"
+#include "local-execution/ops/repartition.h"
 #include "kernels/partition_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc
index 10cd80a6d9..5ae93c4439 100644
--- a/lib/local-execution/src/ops/replicate.cc
+++ b/lib/local-execution/src/ops/replicate.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "replicate.h"
+#include "local-execution/ops/replicate.h"
 #include "kernels/replicate_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "op-attrs/parallel_tensor_shape.h"
diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc
index 433e961a8a..838542a8eb 100644
--- a/lib/local-execution/src/ops/reshape.cc
+++ b/lib/local-execution/src/ops/reshape.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "reshape.h"
+#include "local-execution/ops/reshape.h"
 #include "kernels/reshape_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc
index b767b61b20..63032585b8 100644
--- a/lib/local-execution/src/ops/reverse.cc
+++ b/lib/local-execution/src/ops/reverse.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "reverse.h"
+#include "local-execution/ops/reverse.h"
 #include "kernels/accessor.h"
 #include "kernels/reverse_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc
index 36c4afcaf3..5e78781ddc 100644
--- a/lib/local-execution/src/ops/softmax.cc
+++ b/lib/local-execution/src/ops/softmax.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "softmax.h"
+#include "local-execution/ops/softmax.h"
 #include "kernels/softmax_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "op-attrs/parallel_tensor_shape.h"
diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc
index dc627aae96..556d30109b 100644
--- a/lib/local-execution/src/ops/split.cc
+++ b/lib/local-execution/src/ops/split.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "split.h"
+#include "local-execution/ops/split.h"
 #include "kernels/array_shape.h"
 #include "kernels/split_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc
index ea4fc09e19..41a28340db 100644
--- a/lib/local-execution/src/ops/topk.cc
+++ b/lib/local-execution/src/ops/topk.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "topk.h"
+#include "local-execution/ops/topk.h"
 #include "kernels/topk_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc
index 435df464c0..78e9fbde6f 100644
--- a/lib/local-execution/src/ops/transpose.cc
+++ b/lib/local-execution/src/ops/transpose.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
*/ -#include "transpose.h" +#include "local-execution/ops/transpose.h" #include "kernels/transpose_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/transpose.h" diff --git a/lib/local-execution/src/ops/weight.cc b/lib/local-execution/src/ops/weight.cc index 5537163e85..f96c104f33 100644 --- a/lib/local-execution/src/ops/weight.cc +++ b/lib/local-execution/src/ops/weight.cc @@ -1,4 +1,4 @@ -#include "weight.h" +#include "local-execution/ops/weight.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index 3072b9a8bd..199e232a6b 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/local-execution/src/task_signature_impl.cc @@ -1,33 +1,33 @@ #include "local-execution/task_signature_impl.h" -#include "ops/attention.h" -#include "ops/batch_matmul.h" -#include "ops/batch_norm.h" -#include "ops/cast.h" -#include "ops/combine.h" -#include "ops/concat.h" -#include "ops/conv_2d.h" -#include "ops/dropout.h" -#include "ops/element_binary.h" -#include "ops/element_unary.h" -#include "ops/embedding.h" -#include "ops/flat.h" -#include "ops/gather.h" -#include "ops/input.h" -#include "ops/layer_norm.h" -#include "ops/linear.h" -#include "ops/noop.h" -#include "ops/pool_2d.h" -#include "ops/reduce.h" -#include "ops/reduction.h" -#include "ops/repartition.h" -#include "ops/replicate.h" -#include "ops/reshape.h" -#include "ops/reverse.h" -#include "ops/softmax.h" -#include "ops/split.h" -#include "ops/topk.h" -#include "ops/transpose.h" -#include "ops/weight.h" +#include "local-execution/ops/attention.h" +#include "local-execution/ops/batch_matmul.h" +#include "local-execution/ops/batch_norm.h" +#include "local-execution/ops/cast.h" +#include "local-execution/ops/combine.h" +#include "local-execution/ops/concat.h" +#include "local-execution/ops/conv_2d.h" +#include "local-execution/ops/dropout.h" +#include "local-execution/ops/element_binary.h" +#include "local-execution/ops/element_unary.h" +#include "local-execution/ops/embedding.h" +#include "local-execution/ops/flat.h" +#include "local-execution/ops/gather.h" +#include "local-execution/ops/input.h" +#include "local-execution/ops/layer_norm.h" +#include "local-execution/ops/linear.h" +#include "local-execution/ops/noop.h" +#include "local-execution/ops/pool_2d.h" +#include "local-execution/ops/reduce.h" +#include "local-execution/ops/reduction.h" +#include "local-execution/ops/repartition.h" +#include "local-execution/ops/replicate.h" +#include "local-execution/ops/reshape.h" +#include "local-execution/ops/reverse.h" +#include "local-execution/ops/softmax.h" +#include "local-execution/ops/split.h" +#include "local-execution/ops/topk.h" +#include "local-execution/ops/transpose.h" +#include "local-execution/ops/weight.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 268554b5be..11afc5b209 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -3,35 +3,35 @@ #include "op-attrs/ops/core.h" #include "op-attrs/pcg_operator_attrs.dtg.h" -#include "ops/attention.h" -#include "ops/batch_matmul.h" -#include "ops/batch_norm.h" -#include "ops/broadcast.h" -#include "ops/cast.h" -#include "ops/combine.h" -#include "ops/concat.h" -#include "ops/conv_2d.h" -#include "ops/dropout.h" -#include "ops/element_binary.h" -#include "ops/element_unary.h" -#include "ops/embedding.h" 
-#include "ops/flat.h" -#include "ops/gather.h" -#include "ops/input.h" -#include "ops/layer_norm.h" -#include "ops/linear.h" -#include "ops/noop.h" -#include "ops/pool_2d.h" -#include "ops/reduce.h" -#include "ops/reduction.h" -#include "ops/repartition.h" -#include "ops/replicate.h" -#include "ops/reshape.h" -#include "ops/reverse.h" -#include "ops/softmax.h" -#include "ops/split.h" -#include "ops/topk.h" -#include "ops/transpose.h" +#include "local-execution/ops/attention.h" +#include "local-execution/ops/batch_matmul.h" +#include "local-execution/ops/batch_norm.h" +#include "local-execution/ops/broadcast.h" +#include "local-execution/ops/cast.h" +#include "local-execution/ops/combine.h" +#include "local-execution/ops/concat.h" +#include "local-execution/ops/conv_2d.h" +#include "local-execution/ops/dropout.h" +#include "local-execution/ops/element_binary.h" +#include "local-execution/ops/element_unary.h" +#include "local-execution/ops/embedding.h" +#include "local-execution/ops/flat.h" +#include "local-execution/ops/gather.h" +#include "local-execution/ops/input.h" +#include "local-execution/ops/layer_norm.h" +#include "local-execution/ops/linear.h" +#include "local-execution/ops/noop.h" +#include "local-execution/ops/pool_2d.h" +#include "local-execution/ops/reduce.h" +#include "local-execution/ops/reduction.h" +#include "local-execution/ops/repartition.h" +#include "local-execution/ops/replicate.h" +#include "local-execution/ops/reshape.h" +#include "local-execution/ops/reverse.h" +#include "local-execution/ops/softmax.h" +#include "local-execution/ops/split.h" +#include "local-execution/ops/topk.h" +#include "local-execution/ops/transpose.h" #include "utils/record_formatter.h" #include "utils/variant.h" #include From c6fed294c5b31001f978123c43681c0db32b3e0b Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 13 Nov 2024 13:24:19 -0800 Subject: [PATCH 22/91] Add tensor type, operate over reduced tensor --- .../local-execution/itask_argument_accessor.h | 8 +- .../layer_tensor_key.struct.toml | 23 +++ .../local-execution/local_slots_backing.h | 45 +++--- .../local_task_argument_accessor.h | 10 +- .../local-execution/local_training_backing.h | 13 +- .../include/local-execution/loss_functions.h | 2 +- .../local-execution/model_training_instance.h | 13 +- .../local-execution/op_task_invocation.h | 6 +- .../op_tensor_slot_spec.struct.toml | 6 +- .../include/local-execution/optimizer.h | 14 +- .../reduced_tensor_t.struct.toml | 13 ++ ...t.toml => slot_tensor_type_id.struct.toml} | 8 +- .../local-execution/task_argument_accessor.h | 59 +++++++- .../include/local-execution/task_binding.h | 12 +- .../include/local-execution/task_signature.h | 4 +- .../task_signature.struct.toml | 4 +- .../tensor_guid_slot_spec.struct.toml | 22 --- .../tensor_guid_spec.struct.toml | 23 --- .../local-execution/tensor_reduction.h | 15 ++ .../local-execution/tensor_type.enum.toml | 20 +++ .../tensor_type_slot_spec.struct.toml | 26 ++++ .../unified_tensor_guid.variant.toml | 21 --- .../src/local_slots_backing.cc | 137 +++++++++--------- .../src/local_task_argument_accessor.cc | 12 +- .../src/local_training_backing.cc | 40 ++--- lib/local-execution/src/loss_functions.cc | 19 ++- .../src/model_training_instance.cc | 64 +++++--- lib/local-execution/src/op_task_invocation.cc | 14 +- lib/local-execution/src/op_task_signature.cc | 42 ++++-- lib/local-execution/src/optimizer.cc | 45 +++--- lib/local-execution/src/task_binding.cc | 17 ++- lib/local-execution/src/task_signature.cc | 10 +- 
lib/local-execution/src/tensor_reduction.cc | 17 +++ .../include/op-attrs/operator_attrs.h | 4 +- 34 files changed, 471 insertions(+), 317 deletions(-) create mode 100644 lib/local-execution/include/local-execution/layer_tensor_key.struct.toml create mode 100644 lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml rename lib/local-execution/include/local-execution/{slot_grad_id.struct.toml => slot_tensor_type_id.struct.toml} (62%) delete mode 100644 lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml delete mode 100644 lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_reduction.h create mode 100644 lib/local-execution/include/local-execution/tensor_type.enum.toml create mode 100644 lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml delete mode 100644 lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml create mode 100644 lib/local-execution/src/tensor_reduction.cc diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/local-execution/include/local-execution/itask_argument_accessor.h index b4d188e4a3..9eff9460c2 100644 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ b/lib/local-execution/include/local-execution/itask_argument_accessor.h @@ -5,6 +5,7 @@ #include "local-execution/concrete_arg.h" #include "local-execution/op_task_signature.h" #include "local-execution/privilege_tensor_accessor.h" +#include "local-execution/tensor_type.dtg.h" namespace FlexFlow { @@ -15,10 +16,11 @@ struct ITaskArgumentAccessor { virtual ConcreteArgSpec const &get_concrete_arg(slot_id_t) const = 0; - virtual GenericTensorAccessor - get_tensor(slot_id_t slot, Permissions priv, IsGrad is_grad) const = 0; + virtual GenericTensorAccessor get_tensor(slot_id_t slot, + Permissions priv, + TensorType tensor_type) const = 0; virtual VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const = 0; + slot_id_t slot, Permissions priv, TensorType tensor_type) const = 0; virtual Allocator get_allocator() const = 0; virtual size_t get_device_idx() const = 0; diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml new file mode 100644 index 0000000000..3ec6d7b0f1 --- /dev/null +++ b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "LayerTensorKey" +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +includes = [ + "pcg/layer_guid_t.dtg.h", + "local-execution/reduced_tensor_t.dtg.h" +] + +[[fields]] +name = "layer_guid" +type = "::FlexFlow::layer_guid_t" + +[[fields]] +name = "reduced_tensor" +type = "::FlexFlow::reduced_tensor_t" diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 46e66e97a2..a632f432cf 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -3,6 +3,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H #include "kernels/accessor.h" +#include "local-execution/layer_tensor_key.dtg.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/non_graph_tensor_guid_t.dtg.h" #include 
"local-execution/op_task_invocation.h" @@ -10,26 +11,25 @@ #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/tensor_role.dtg.h" -#include "local-execution/unified_tensor_guid.dtg.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" #include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { +using LayerTensorBackingMap = + std::unordered_map; + using TensorBackingMap = - std::unordered_map; -using NonGraphTensorBackingMap = - std::unordered_map; + std::unordered_map; struct LocalSlotsBacking { - LocalSlotsBacking(TensorBackingMap const &, RuntimeArgConfig const &); + LocalSlotsBacking(LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &); public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void insert_into_tensor_mapping(tensor_guid_t const &, - GenericTensorAccessorW const &); void allocate_layer_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); @@ -44,7 +44,9 @@ struct LocalSlotsBacking { TaskSignature const &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; - TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; + TensorSlotsBacking + construct_tensor_slots_backing(TaskBinding const &, + std::optional const &) const; ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; @@ -53,24 +55,27 @@ struct LocalSlotsBacking { ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, layer_guid_t const &) const; - GenericTensorAccessorW const &get_tensor_backing(UnifiedTensorGuid const &, - IsGrad) const; + GenericTensorAccessorW const & + get_tensor_backing(TensorType const &, + reduced_tensor_t const &, + std::optional const &) const; - bool is_tensor_allocated(tensor_guid_t const &) const; - bool is_gradient_tensor_allocated(tensor_guid_t const &) const; + bool is_forward_tensor_allocated(LayerTensorKey const &) const; + bool is_non_graph_tensor_allocated(reduced_tensor_t const &) const; public: // tensors - TensorBackingMap tensor_mapping; - TensorBackingMap gradient_tensor_mapping; - NonGraphTensorBackingMap optimizer_tensor_mapping; - std::unordered_map> + LayerTensorBackingMap tensor_mapping; + LayerTensorBackingMap gradient_tensor_mapping; + LayerTensorBackingMap optimizer_tensor_mapping; + TensorBackingMap non_graph_tensor_mapping; + std::unordered_map> input_tensor_slots; - std::unordered_map> + std::unordered_map> weight_tensor_slots; - std::unordered_map> + std::unordered_map> output_tensor_slots; - std::unordered_map> + std::unordered_map> weight_optimizer_tensor_guids; // arguments diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index 1e1516a0de..db0e98c2b1 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_argument_accessor.h" #include #include @@ 
-9,7 +9,7 @@ namespace FlexFlow { using TensorSlotsBacking = std::unordered_map< - SlotGradId, + SlotTensorTypeId, std::variant>>; using ArgSlotsBacking = std::unordered_map; @@ -25,9 +25,9 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { GenericTensorAccessor get_tensor(slot_id_t slot, Permissions priv, - IsGrad is_grad) const override; + TensorType tensor_type) const override; VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const override; + slot_id_t slot, Permissions priv, TensorType tensor_type) const override; Allocator get_allocator() const override; @@ -40,7 +40,7 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { }; using TensorSlotsBackingWithoutAddresses = std::unordered_map< - SlotGradId, + SlotTensorTypeId, std::variant, std::vector>>>; diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 6dfa8ad443..cbab4bf031 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -15,7 +15,8 @@ using PerLayerElapsedTime = struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, - TensorBackingMap const &, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &); void register_and_allocate_layer(layer_guid_t const &); void allocate_layer_optimizer_tensors(layer_guid_t const &, @@ -24,17 +25,17 @@ struct LocalTrainingBacking { void execute_init(layer_guid_t const &); std::optional execute_forward(layer_guid_t const &); void compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor); + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor); std::optional execute_backward(layer_guid_t const &); void execute_update(layer_guid_t const &, OptimizerAttrs const &); - TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; + TaskArgumentAccessor + get_task_arg_accessor(TaskInvocation const &, + std::optional const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; - void insert_tensor(tensor_guid_t const &, GenericTensorAccessorW const &); - private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, TaskArgumentAccessor const &); diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index 2298115d5d..4ce74da766 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -26,7 +26,7 @@ namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); TaskInvocation - backward(LossAttrs const &, tensor_guid_t logit, tensor_guid_t label); + backward(LossAttrs const &, reduced_tensor_t logit, reduced_tensor_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 14473ff26e..5cc13f0b40 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -12,11 +12,12 @@ using PerLayerElapsedTime = 
struct ModelTrainingInstance { ModelTrainingInstance(Allocator const &, ComputationGraph const &, - TensorBackingMap const &, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &, LossAttrs const &, - tensor_guid_t const & logit_tensor, - tensor_guid_t const & label_tensor, + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor, OptimizerAttrs const &); void execute_init(); @@ -27,11 +28,11 @@ struct ModelTrainingInstance { ComputationGraph computation_graph; LocalTrainingBacking training_backing; LossAttrs loss_attrs; - tensor_guid_t logit_tensor; - tensor_guid_t label_tensor; + reduced_tensor_t logit_tensor; + reduced_tensor_t label_tensor; OptimizerAttrs optimizer_attrs; }; -} +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h index 0f351c3a0e..6484981ebf 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -10,7 +10,7 @@ #include "local-execution/op_tensor_spec.h" #include "local-execution/profiling.h" #include "local-execution/runtime_arg_ref.h" -#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/variadic_tensor_ref.h" #include @@ -84,14 +84,14 @@ struct OpTaskBinding { bool operator==(OpTaskBinding const &other) const; bool operator!=(OpTaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; void bind_from_forward(OpTaskBinding const &fwd); private: - std::unordered_map tensor_bindings; + std::unordered_map tensor_bindings; std::unordered_map arg_bindings; private: diff --git a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml index 590dbe6362..54638a7eb6 100644 --- a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml @@ -11,7 +11,7 @@ includes = [ "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/tensor_role.dtg.h", - "local-execution/is_grad.dtg.h", + "local-execution/tensor_type.dtg.h", "local-execution/op_slot_options.dtg.h", ] @@ -28,8 +28,8 @@ name = "tensor_role" type = "::FlexFlow::TensorRole" [[fields]] -name = "is_grad" -type = "::FlexFlow::IsGrad" +name = "tensor_type" +type = "::FlexFlow::TensorType" [[fields]] name = "slot_option" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index acf9b8a550..2eb480a0c1 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -14,21 +14,21 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, - tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors); + reduced_tensor_t const &weight, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs 
const &,
-                          tensor_guid_t const &weight,
-                          non_graph_tensor_guid_t const &sgd_v);
+                          reduced_tensor_t const &weight,
+                          reduced_tensor_t const &sgd_v);
 TaskImplFunction get_sgd_update_task_impl();
 
 TaskSignature get_adam_update_signature();
 TaskInvocation adam_update(AdamOptimizerAttrs const &,
-                           tensor_guid_t const &weight,
-                           non_graph_tensor_guid_t const &adam_v,
-                           non_graph_tensor_guid_t const &adam_m);
+                           reduced_tensor_t const &weight,
+                           reduced_tensor_t const &adam_v,
+                           reduced_tensor_t const &adam_m);
 TaskImplFunction get_adam_update_task_impl();
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml b/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml
new file mode 100644
index 0000000000..726249c970
--- /dev/null
+++ b/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml
@@ -0,0 +1,13 @@
+namespace = "FlexFlow"
+name = "reduced_tensor_t"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "fmt",
+]
+
+
+[[fields]]
+name = "raw_index"
+type = "int"
diff --git a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml b/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
similarity index 62%
rename from lib/local-execution/include/local-execution/slot_grad_id.struct.toml
rename to lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
index 256091d272..b3b3a320c7 100644
--- a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml
+++ b/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "SlotGradId"
+name = "SlotTensorTypeId"
 features = [
   "eq",
   "ord",
@@ -8,7 +8,7 @@ features = [
 ]
 
 includes = [
-  "local-execution/is_grad.dtg.h",
+  "local-execution/tensor_type.dtg.h",
   "local-execution/slot_id_t.dtg.h",
 ]
 
@@ -17,5 +17,5 @@ name = "slot_id"
 type = "::FlexFlow::slot_id_t"
 
 [[fields]]
-name = "is_grad"
-type = "::FlexFlow::IsGrad"
+name = "tensor_type"
+type = "::FlexFlow::TensorType"
diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h
index 54c8dfc5f1..29d5fb8fbe 100644
--- a/lib/local-execution/include/local-execution/task_argument_accessor.h
+++ b/lib/local-execution/include/local-execution/task_argument_accessor.h
@@ -8,6 +8,7 @@ namespace FlexFlow {
 
 struct TaskArgumentAccessor {
+  // arguments
   template <typename T>
   T const &get_argument(slot_id_t slot) const {
     if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v<T>) {
@@ -24,6 +25,7 @@ struct TaskArgumentAccessor {
     return this->get_argument<T>(slot_id_t{slot});
   }
 
+  // tensors
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(int slot) const {
     return this->get_tensor<PRIV>(slot_id_t{slot});
   }
@@ -32,7 +34,7 @@ struct TaskArgumentAccessor {
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id_t slot) const {
     return std::get<privilege_mode_to_accessor<PRIV>>(
-        this->ptr->get_tensor(slot, PRIV, IsGrad::NO));
+        this->ptr->get_tensor(slot, PRIV, TensorType::FORWARD));
   }
 
   template <Permissions PRIV>
@@ -43,9 +45,32 @@
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id_t slot) const {
     return std::get<privilege_mode_to_accessor<PRIV>>(
-        this->ptr->get_tensor(slot, PRIV, IsGrad::YES));
+        this->ptr->get_tensor(slot, PRIV, TensorType::GRADIENT));
   }
 
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_optimizer_tensor(int slot) const {
+    return this->get_optimizer_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_optimizer_tensor(slot_id_t slot) const {
+    return std::get<privilege_mode_to_accessor<PRIV>>(
+        this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER));
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_non_graph_tensor(int slot) const {
+    return this->get_non_graph_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_non_graph_tensor(slot_id_t slot) const {
+    return std::get<privilege_mode_to_accessor<PRIV>>(
+        this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH));
+  }
+
+  // variadic tensors
   template <Permissions PRIV>
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(int slot) const {
@@ -56,7 +81,7 @@
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(slot_id_t slot) const {
     return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
-        this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::NO));
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::FORWARD));
   }
 
   template <Permissions PRIV>
@@ -69,7 +94,33 @@
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor_grad(slot_id_t slot) const {
     return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
-        this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::YES));
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::GRADIENT));
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_optimizer_tensor(int slot) const {
+    return this->get_variadic_optimizer_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_optimizer_tensor(slot_id_t slot) const {
+    return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER));
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_non_graph_tensor(int slot) const {
+    return this->get_variadic_non_graph_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_non_graph_tensor(slot_id_t slot) const {
+    return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH));
   }
 
   Allocator get_allocator() const {
diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h
index 96c96473e4..93461e2e55 100644
--- a/lib/local-execution/include/local-execution/task_binding.h
+++ b/lib/local-execution/include/local-execution/task_binding.h
@@ -1,12 +1,12 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
 
-#include "local-execution/slot_grad_id.dtg.h"
+#include "local-execution/reduced_tensor_t.dtg.h"
 #include "local-execution/slot_id_t.dtg.h"
+#include "local-execution/slot_tensor_type_id.dtg.h"
 #include "local-execution/task_arg_spec.dtg.h"
 #include "local-execution/task_id_t.dtg.h"
 #include "local-execution/task_signature.dtg.h"
-#include "local-execution/tensor_guid_spec.dtg.h"
 #include "utils/hash/unordered_map.h"
 
 namespace FlexFlow {
@@ -14,8 +14,8 @@ namespace FlexFlow {
 struct TaskBinding {
   TaskBinding() = default;
 
-  void bind(int, TensorGuidSpec const &);
-  void bind(slot_id_t, TensorGuidSpec const &);
+  void bind(int, TensorType const &, reduced_tensor_t const &);
+  void bind(slot_id_t, TensorType const &, reduced_tensor_t const &);
 
   template <typename T>
   void bind_arg(int name, T const &t) {
@@ -40,12 +40,12 @@ struct TaskBinding {
   bool operator==(TaskBinding const &other) const;
   bool operator!=(TaskBinding const &other) const;
 
-  std::unordered_map<SlotGradId, TensorGuidSpec> const &
+  std::unordered_map<SlotTensorTypeId, reduced_tensor_t> const &
       get_tensor_bindings() const;
   std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
 
 private:
-  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
+  std::unordered_map<SlotTensorTypeId, reduced_tensor_t> tensor_bindings;
   std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
 
 private:
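
The two headers above are the two halves of the new slot protocol: an invocation names each slot with a TensorType plus a reduced_tensor_t, and the task body later reads the slot back through the getter of the matching type. A minimal sketch of both sides (the slot name, indices, and accessor variable are hypothetical, not part of this patch):

    enum Slots { WEIGHT };
    TaskBinding b;
    reduced_tensor_t w = reduced_tensor_t{0};  // reduced index of the weight tensor
    b.bind(WEIGHT, TensorType::FORWARD, w);    // the weight's value
    b.bind(WEIGHT, TensorType::GRADIENT, w);   // the gradient buffer for the same tensor
    // ... and later, inside the task implementation, given a TaskArgumentAccessor acc:
    // auto weight      = acc.get_tensor<Permissions::RW>(WEIGHT);
    // auto weight_grad = acc.get_tensor_grad<Permissions::RO>(WEIGHT);
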
diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h
index 6da69f2441..b10edce6d4 100644
--- a/lib/local-execution/include/local-execution/task_signature.h
+++ b/lib/local-execution/include/local-execution/task_signature.h
@@ -10,11 +10,11 @@ TaskSignature make_empty_task_signature();
 
 void add_slot(TaskSignature &,
               int name,
-              IsGrad,
+              TensorType,
               SlotType slot_type = SlotType::TENSOR);
 void add_slot(TaskSignature &,
               slot_id_t name,
-              IsGrad,
+              TensorType,
               SlotType slot_type = SlotType::TENSOR);
 
 template <typename T>
diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml
index ac408a7b68..7efb0c658a 100644
--- a/lib/local-execution/include/local-execution/task_signature.struct.toml
+++ b/lib/local-execution/include/local-execution/task_signature.struct.toml
@@ -7,7 +7,7 @@ features = [
 ]
 
 includes = [
-  "local-execution/tensor_guid_slot_spec.dtg.h",
+  "local-execution/tensor_type_slot_spec.dtg.h",
   "local-execution/slot_id_t.dtg.h",
   "<unordered_map>",
   "<typeindex>"
 ]
@@ -30,4 +30,4 @@ type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>"
 
 [[fields]]
 name = "tensor_guid_slots"
-type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorGuidSlotSpec>"
+type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorTypeSlotSpec>"
diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
deleted file mode 100644
index 9b7e9c14f9..0000000000
--- a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-namespace = "FlexFlow"
-name = "TensorGuidSlotSpec"
-features = [
-  "eq",
-  "fmt",
-  "hash",
-  "ord",
-]
-
-includes = [
-  "local-execution/slot_type.dtg.h",
-  "local-execution/is_grad.dtg.h",
-]
-
-[[fields]]
-name = "slot_type"
-type = "::FlexFlow::SlotType"
-
-[[fields]]
-name = "is_grad"
-type = "::FlexFlow::IsGrad"
-
diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
deleted file mode 100644
index 1d147f60e5..0000000000
--- a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
+++ /dev/null
@@ -1,23 +0,0 @@
-namespace = "FlexFlow"
-name = "TensorGuidSpec"
-features = [
-  "eq",
-  "fmt",
-  "hash",
-  "ord"
-]
-
-includes = [
-  "pcg/tensor_guid_t.dtg.h",
-  "local-execution/is_grad.dtg.h",
-  "local-execution/unified_tensor_guid.dtg.h"
-]
-
-[[fields]]
-name = "tensor_guid"
-type = "::FlexFlow::UnifiedTensorGuid"
-
-[[fields]]
-name = "is_grad"
-type = "::FlexFlow::IsGrad"
-
diff --git a/lib/local-execution/include/local-execution/tensor_reduction.h b/lib/local-execution/include/local-execution/tensor_reduction.h
new file mode 100644
index 0000000000..eb55b07ee4
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_reduction.h
@@ -0,0 +1,15 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H
+#define _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H
+
+#include "local-execution/reduced_tensor_t.dtg.h"
+#include "pcg/tensor_guid_t.dtg.h"
+
+namespace FlexFlow {
+
+reduced_tensor_t lower(tensor_guid_t const &);
+
+std::vector<reduced_tensor_t> lower(std::vector<tensor_guid_t> const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/tensor_type.enum.toml b/lib/local-execution/include/local-execution/tensor_type.enum.toml
new file mode 100644
index 0000000000..31ce5ba83a
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_type.enum.toml
@@ -0,0 +1,20 @@
+namespace = "FlexFlow"
+name = "TensorType"
+features = [
+  "hash",
+  "fmt",
"rapidcheck", + "json", +] + +[[values]] +name = "NON_GRAPH" + +[[values]] +name = "FORWARD" + +[[values]] +name = "GRADIENT" + +[[values]] +name = "OPTIMIZER" diff --git a/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml new file mode 100644 index 0000000000..ceba809474 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "TensorTypeSlotSpec" +features = [ + "eq", + "fmt", + "hash", + "ord", +] + +includes = [ + "local-execution/slot_type.dtg.h", + "local-execution/slot_id_t.dtg.h", + "local-execution/tensor_type.dtg.h", +] + +[[fields]] +name = "slot_id" +type = "::FlexFlow::slot_id_t" + +[[fields]] +name = "tensor_type" +type = "::FlexFlow::TensorType" + +[[fields]] +name = "slot_type" +type = "::FlexFlow::SlotType" diff --git a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml deleted file mode 100644 index 3d2cd8e45f..0000000000 --- a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "UnifiedTensorGuid" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "pcg/tensor_guid_t.dtg.h", - "local-execution/non_graph_tensor_guid_t.dtg.h", -] - -[[values]] -type = "::FlexFlow::tensor_guid_t" -key = "tensor_guid" - -[[values]] -type = "::FlexFlow::non_graph_tensor_guid_t" -key = "non_graph_tensor_guid" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 25abc72567..f1bb5a9a5b 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -1,4 +1,5 @@ #include "local-execution/local_slots_backing.h" +#include "local-execution/tensor_reduction.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "utils/containers/contains_key.h" @@ -7,9 +8,12 @@ namespace FlexFlow { -LocalSlotsBacking::LocalSlotsBacking(TensorBackingMap const &allocated_tensors, - RuntimeArgConfig const &runtime_arg_config) - : tensor_mapping(allocated_tensors), +LocalSlotsBacking::LocalSlotsBacking( + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &runtime_arg_config) + : tensor_mapping(allocated_forward_tensors), + non_graph_tensor_mapping(allocated_non_graph_tensors), runtime_arg_config(runtime_arg_config){}; void LocalSlotsBacking::add_per_device_op_state( @@ -18,13 +22,6 @@ void LocalSlotsBacking::add_per_device_op_state( this->per_device_op_states.insert({op_guid, device_state}); } -void LocalSlotsBacking::insert_into_tensor_mapping( - tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) { - if (!contains_key(this->tensor_mapping, tensor)) { - this->tensor_mapping.insert({tensor, tensor_backing}); - } -} - void LocalSlotsBacking::allocate_layer_tensors( layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, @@ -46,15 +43,15 @@ void LocalSlotsBacking::allocate_tensors_by_role( switch (role) { case TensorRole::INPUT: tensors = get_incoming_inputs(computation_graph, layer_guid); - this->input_tensor_slots.insert({layer_guid, tensors}); + this->input_tensor_slots.insert({layer_guid, lower(tensors)}); break; case TensorRole::WEIGHT: 
tensors = get_incoming_weights(computation_graph, layer_guid); - this->weight_tensor_slots.insert({layer_guid, tensors}); + this->weight_tensor_slots.insert({layer_guid, lower(tensors)}); break; case TensorRole::OUTPUT: tensors = get_outgoing_tensors(computation_graph, layer_guid); - this->output_tensor_slots.insert({layer_guid, tensors}); + this->output_tensor_slots.insert({layer_guid, lower(tensors)}); break; default: throw mk_runtime_error("Invalid tensor role, got {}", role); @@ -62,19 +59,22 @@ void LocalSlotsBacking::allocate_tensors_by_role( for (tensor_guid_t const &tensor : tensors) { TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); + reduced_tensor_t reduced_tensor = lower(tensor); + LayerTensorKey layer_tensor_key = + LayerTensorKey{layer_guid, reduced_tensor}; // tensor allocation - if (!is_tensor_allocated(tensor)) { + if (!is_forward_tensor_allocated(layer_tensor_key)) { GenericTensorAccessorW tensor_backing = allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_mapping.insert({tensor, tensor_backing}); + this->tensor_mapping.insert({layer_tensor_key, tensor_backing}); } // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && - !is_gradient_tensor_allocated(tensor)) { + if (tensor_attrs.create_gradients == CreateGrad::YES) { GenericTensorAccessorW gradient_tensor_backing = allocator.allocate_tensor(tensor_attrs.shape); - this->gradient_tensor_mapping.insert({tensor, gradient_tensor_backing}); + this->gradient_tensor_mapping.insert( + {layer_tensor_key, gradient_tensor_backing}); } } } @@ -85,53 +85,52 @@ void LocalSlotsBacking::allocate_optimizer_tensors( ComputationGraph const &cg, Allocator &allocator, TaskSignature const &sig) { - GenericTensorAccessorW weight_backing = - get_tensor_backing(UnifiedTensorGuid{weight}, IsGrad::NO); + GenericTensorAccessorW weight_backing = this->get_tensor_backing( + TensorType::FORWARD, lower(weight), weight_layer); int num_grad_buffer_tensors = sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) - std::vector grad_buffer_tensors; + std::vector optimizer_buffer_tensors; for (int i = 0; i < num_grad_buffer_tensors; ++i) { - non_graph_tensor_guid_t buffer_tensor_guid = non_graph_tensor_guid_t{i}; + reduced_tensor_t buffer_tensor = reduced_tensor_t{i}; GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( get_tensor_shape(weight_backing.shape, weight_backing.data_type)); - this->optimizer_tensor_mapping.insert({buffer_tensor_guid, buffer_backing}); - grad_buffer_tensors.push_back(buffer_tensor_guid); + this->optimizer_tensor_mapping.insert( + {LayerTensorKey{weight_layer, buffer_tensor}, buffer_backing}); + optimizer_buffer_tensors.push_back(buffer_tensor); } this->weight_optimizer_tensor_guids.insert( - {weight_layer, grad_buffer_tensors}); + {weight_layer, optimizer_buffer_tensors}); } -bool LocalSlotsBacking::is_tensor_allocated( - tensor_guid_t const &tensor_id) const { - return contains_key(this->tensor_mapping, tensor_id); +bool LocalSlotsBacking::is_forward_tensor_allocated( + LayerTensorKey const &layer_tensor_id) const { + return contains_key(this->tensor_mapping, layer_tensor_id); } -bool LocalSlotsBacking::is_gradient_tensor_allocated( - tensor_guid_t const &tensor_id) const { - return contains_key(this->gradient_tensor_mapping, tensor_id); +bool LocalSlotsBacking::is_non_graph_tensor_allocated( + reduced_tensor_t const &tensor_id) const { + return contains_key(this->non_graph_tensor_mapping, tensor_id); } -GenericTensorAccessorW const & 
- LocalSlotsBacking::get_tensor_backing(UnifiedTensorGuid const &tensor_id, - IsGrad is_grad) const { - if (tensor_id.has()) { - tensor_guid_t graph_tensor_guid = tensor_id.get(); - switch (is_grad) { - case IsGrad::NO: - assert(contains_key(this->tensor_mapping, graph_tensor_guid)); - return this->tensor_mapping.at(graph_tensor_guid); - case IsGrad::YES: - assert(contains_key(this->gradient_tensor_mapping, graph_tensor_guid)); - return this->gradient_tensor_mapping.at(graph_tensor_guid); - default: - throw mk_runtime_error(fmt::format( - "IsGrad should only have YES or NO, received {}", is_grad)); - } - } else { - non_graph_tensor_guid_t non_graph_tensor_guid = - tensor_id.get(); - assert(contains_key(this->optimizer_tensor_mapping, non_graph_tensor_guid)); - return this->optimizer_tensor_mapping.at(non_graph_tensor_guid); +GenericTensorAccessorW const &LocalSlotsBacking::get_tensor_backing( + TensorType const &tensor_type, + reduced_tensor_t const &tensor_id, + std::optional const &layer_guid) const { + switch (tensor_type) { + case TensorType::FORWARD: + return this->tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + case TensorType::NON_GRAPH: + return this->non_graph_tensor_mapping.at(tensor_id); + case TensorType::GRADIENT: + return this->gradient_tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + case TensorType::OPTIMIZER: + return this->optimizer_tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + default: + throw mk_runtime_error( + fmt::format("Invalid tensor type {}", tensor_type)); } } @@ -140,9 +139,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotGradId slot_grad_id = tensor_binding.first; + SlotTensorTypeId slot_grad_id = tensor_binding.first; OpTensorSpec tensor_spec = tensor_binding.second; - std::vector tensor_guids; + std::vector tensor_guids; int weight_adjusted_idx = 0; switch (tensor_spec.role) { case TensorRole::WEIGHT: @@ -162,26 +161,25 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( fmt::format("Invalid TensorRole {}", tensor_spec.role)); } - IsGrad is_grad = slot_grad_id.is_grad; - GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - UnifiedTensorGuid{tensor_guids.at(tensor_spec.idx)}, is_grad); - - mapping.insert({slot_grad_id, tensor_backing}); + mapping.insert({slot_grad_id, + this->get_tensor_backing(slot_grad_id.tensor_type, + tensor_guids.at(tensor_spec.idx), + op_guid)}); } return mapping; } TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { + TaskBinding const &binding, + std::optional const &layer_guid) const { TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotGradId slot_grad_id = tensor_binding.first; - TensorGuidSpec tensor_spec = tensor_binding.second; - + reduced_tensor_t tensor_id = tensor_binding.second; + SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; GenericTensorAccessorW accessor = this->get_tensor_backing( - UnifiedTensorGuid{tensor_spec.tensor_guid}, slot_grad_id.is_grad); - mapping.insert({slot_grad_id, accessor}); + slot_tensor_type_id.tensor_type, tensor_id, layer_guid); + mapping.insert({slot_tensor_type_id, accessor}); } return mapping; @@ -229,13 +227,14 @@ ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( op_arg_ref_spec.get_ref_type().get(); 
assert(contains_key(this->input_tensor_slots, op_guid)); - std::vector input_tensor_guids = + std::vector input_tensor_guids = this->input_tensor_slots.at(op_guid); assert(input_tensor_guids.size() > index_op_arg_ref.idx); - GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - UnifiedTensorGuid{input_tensor_guids.at(index_op_arg_ref.idx)}, - IsGrad::NO); + GenericTensorAccessorW tensor_backing = + this->get_tensor_backing(TensorType::FORWARD, + input_tensor_guids.at(index_op_arg_ref.idx), + op_guid); ParallelTensorShape shape = lift_to_parallel( get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); return ConcreteArgSpec::create(shape); diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 5d0156201e..75479a1f88 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -19,10 +19,10 @@ ConcreteArgSpec const & } GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const { - SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; + slot_id_t slot, Permissions priv, TensorType tensor_type) const { + SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; auto tensor_backing = std::get( - this->tensor_slots_backing.at(slot_grad_pair)); + this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { GenericTensorAccessorR readonly_tensor_backing = { tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; @@ -34,10 +34,10 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( } } VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const { - SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; + slot_id_t slot, Permissions priv, TensorType tensor_type) const { + SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; auto variadic_tensor_backing = std::get>( - this->tensor_slots_backing.at(slot_grad_pair)); + this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { std::vector readonly_variadic_tensor_backing = {}; for (GenericTensorAccessorW const &tensor_backing : diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 0cb8146467..e432b1afe9 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -3,6 +3,7 @@ #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" +#include "local-execution/tensor_reduction.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" @@ -15,10 +16,13 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, - TensorBackingMap const &tensor_backing_mapping, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &runtime_arg_config) : allocator(allocator), computation_graph(computation_graph), - local_slots_backing(tensor_backing_mapping, runtime_arg_config), + local_slots_backing(allocated_forward_tensors, + allocated_non_graph_tensors, + runtime_arg_config), task_registry(empty_task_registry()) {} void 
LocalTrainingBacking::register_and_allocate_layer( @@ -96,15 +100,16 @@ std::optional } void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor) { - assert(this->local_slots_backing.is_tensor_allocated(logit_tensor) && - this->local_slots_backing.is_tensor_allocated(label_tensor)); + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor) { + assert( + this->local_slots_backing.is_non_graph_tensor_allocated(logit_tensor) && + this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation); + this->get_task_arg_accessor(loss_invocation, std::nullopt); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get().function_ptr(loss_accessor); } @@ -130,28 +135,30 @@ void LocalTrainingBacking::execute_update( LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); if (layer_attrs.attrs.has()) { // get tensors - tensor_guid_t weight_tensor = - get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector grad_buffer_tensors = + reduced_tensor_t weight_tensor = + lower(get_only(get_outgoing_tensors(this->computation_graph, node))); + std::vector optimizer_buffer_tensors = this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation TaskInvocation invocation = get_update_invocation( - optimizer_attrs, weight_tensor, grad_buffer_tensors); + optimizer_attrs, weight_tensor, optimizer_buffer_tensors); // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + TaskArgumentAccessor accessor = + this->get_task_arg_accessor(invocation, node); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get().function_ptr(accessor); } } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation) const { + TaskInvocation const &invocation, + std::optional const &layer_guid) const { TensorSlotsBacking tensor_slots_backing = this->local_slots_backing.construct_tensor_slots_backing( - invocation.binding); + invocation.binding, layer_guid); ArgSlotsBacking arg_slots_backing = this->local_slots_backing.construct_arg_slots_backing(invocation.binding); return TaskArgumentAccessor::create( @@ -171,9 +178,4 @@ TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor( this->allocator, tensor_slots_backing, arg_slots_backing); } -void LocalTrainingBacking::insert_tensor( - tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) { - this->local_slots_backing.insert_into_tensor_mapping(tensor, tensor_backing); -} - } // namespace FlexFlow diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index a37c1d706b..e54841acb5 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -24,20 +24,23 @@ enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, LOGIT, IsGrad::NO); - add_slot(sig, LABEL, IsGrad::NO); - add_slot(sig, LOGIT, IsGrad::YES); + add_slot(sig, LOGIT, TensorType::NON_GRAPH); + add_slot(sig, LABEL, 
TensorType::NON_GRAPH); + add_slot(sig, LOGIT, TensorType::GRADIENT); + add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); return sig; } -TaskInvocation - backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) { +TaskInvocation backward(LossAttrs const &attrs, + reduced_tensor_t logit, + reduced_tensor_t label) { TaskBinding b; - b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::NO}); - b.bind(LABEL, TensorGuidSpec{UnifiedTensorGuid{label}, IsGrad::NO}); - b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::YES}); + b.bind(LOGIT, TensorType::NON_GRAPH, logit); + b.bind(LABEL, TensorType::NON_GRAPH, label); + b.bind(LOGIT, TensorType::GRADIENT, logit); + b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index abdced1bb5..5a58e4c524 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,59 +1,77 @@ #include "local-execution/model_training_instance.h" #include "pcg/computation_graph.h" -#include "utils/containers/reversed.h" #include "pcg/optimizer_attrs.h" +#include "utils/containers/reversed.h" namespace FlexFlow { - -ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator, - ComputationGraph const & computation_graph, - TensorBackingMap const & tensor_backing_map, - RuntimeArgConfig const & runtime_arg_config, - LossAttrs const & loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor, - OptimizerAttrs const & optimizer_attrs) - : computation_graph(computation_graph), training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { + +ModelTrainingInstance::ModelTrainingInstance( + Allocator const &allocator, + ComputationGraph const &computation_graph, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &runtime_arg_config, + LossAttrs const &loss_attrs, + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor, + OptimizerAttrs const &optimizer_attrs) + : computation_graph(computation_graph), + training_backing(allocator, + computation_graph, + allocated_forward_tensors, + allocated_non_graph_tensors, + runtime_arg_config), + loss_attrs(loss_attrs), logit_tensor(logit_tensor), + label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { // allocate each layer's tensors - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.register_and_allocate_layer(node); - this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs); + this->training_backing.allocate_layer_optimizer_tensors( + node, this->optimizer_attrs); } } void ModelTrainingInstance::execute_init() { - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.execute_init(node); } } PerLayerElapsedTime ModelTrainingInstance::execute_forward() { PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { - std::optional elapsed_time = 
this->training_backing.execute_forward(node); + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { + std::optional elapsed_time = + this->training_backing.execute_forward(node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } PerLayerElapsedTime ModelTrainingInstance::execute_backward() { - this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, this->label_tensor); - + this->training_backing.compute_loss( + this->loss_attrs, this->logit_tensor, this->label_tensor); + PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const & node: reversed(topological_ordering(this->computation_graph))) { - std::optional elapsed_time = this->training_backing.execute_backward(node); + for (layer_guid_t const &node : + reversed(topological_ordering(this->computation_graph))) { + std::optional elapsed_time = + this->training_backing.execute_backward(node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } void ModelTrainingInstance::execute_update() { - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.execute_update(node, this->optimizer_attrs); } - this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs); + this->optimizer_attrs = + get_next_iteration_optimizer_attrs(this->optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 19c8894b05..81bf185911 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -20,7 +20,8 @@ void OpTaskBinding::bind(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({SlotGradId{slot, IsGrad::NO}, tensor_spec}); + this->tensor_bindings.insert( + {SlotTensorTypeId{slot, TensorType::FORWARD}, tensor_spec}); } void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { @@ -28,7 +29,8 @@ void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind_grad(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({SlotGradId{slot, IsGrad::YES}, tensor_spec}); + this->tensor_bindings.insert( + {SlotTensorTypeId{slot, TensorType::GRADIENT}, tensor_spec}); } void OpTaskBinding::insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec) { @@ -44,13 +46,13 @@ bool OpTaskBinding::operator!=(OpTaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> OpTaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & OpTaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } @@ -89,8 +91,8 @@ bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotGradId tensor_key = - SlotGradId{op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; + SlotTensorTypeId tensor_key = SlotTensorTypeId{ + op_tensor_slot_spec.name, op_tensor_slot_spec.tensor_type}; OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); if 
(is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 932b330453..5c8b19265a 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -12,8 +12,12 @@ void OpTaskSignature::add_input_slot(int name, SlotType slot_type) { } void OpTaskSignature::add_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::INPUT, + TensorType::FORWARD, + OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -23,8 +27,12 @@ void OpTaskSignature::add_optional_input_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::INPUT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -38,7 +46,7 @@ void OpTaskSignature::add_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -54,7 +62,7 @@ void OpTaskSignature::add_optional_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::OPTIONAL_UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -68,7 +76,7 @@ void OpTaskSignature::add_output_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::OUTPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -80,8 +88,12 @@ void OpTaskSignature::add_bwd_optional_output_slot(int name, void OpTaskSignature::add_bwd_optional_output_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::OUTPUT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -94,7 +106,7 @@ void OpTaskSignature::add_weight_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::WEIGHT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -105,8 +117,12 @@ void OpTaskSignature::add_optional_weight_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_weight_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::WEIGHT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::WEIGHT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -130,7 +146,7 @@ OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd) { 
OpTensorSlotSpec{op_tensor_slot_spec.name, op_tensor_slot_spec.slot_type, op_tensor_slot_spec.tensor_role, - IsGrad::YES, + TensorType::GRADIENT, op_tensor_slot_spec.slot_option}; bwd.op_tensor_slots.insert(grad_spec); } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1e06dee96a..5c0d6c54f2 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -9,9 +9,10 @@ enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE }; TaskSignature get_sgd_update_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, WEIGHT, IsGrad::YES); - add_slot(sig, WEIGHT, IsGrad::NO); - add_slot(sig, SGD_V, IsGrad::YES); + add_slot(sig, WEIGHT, TensorType::FORWARD); + add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, SGD_V, TensorType::OPTIMIZER); + add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { @@ -21,13 +22,14 @@ TaskSignature get_sgd_update_signature() { } TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, - tensor_guid_t const &weight, - non_graph_tensor_guid_t const &sgd_v) { + reduced_tensor_t const &weight, + reduced_tensor_t const &sgd_v) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); + b.bind(WEIGHT, TensorType::FORWARD, weight); + b.bind(WEIGHT, TensorType::GRADIENT, weight); + if (attrs.momentum > 0.0f) { - b.bind(SGD_V, TensorGuidSpec{UnifiedTensorGuid{sgd_v}, IsGrad::YES}); + b.bind(SGD_V, TensorType::OPTIMIZER, sgd_v); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -97,10 +99,11 @@ TaskImplFunction get_sgd_update_task_impl() { TaskSignature get_adam_update_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, WEIGHT, IsGrad::YES); - add_slot(sig, WEIGHT, IsGrad::NO); - add_slot(sig, ADAM_V, IsGrad::YES); - add_slot(sig, ADAM_M, IsGrad::YES); + add_slot(sig, WEIGHT, TensorType::FORWARD); + add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, ADAM_V, TensorType::OPTIMIZER); + add_slot(sig, ADAM_M, TensorType::OPTIMIZER); + add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { @@ -110,14 +113,14 @@ TaskSignature get_adam_update_signature() { } TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, - tensor_guid_t const &weight, - non_graph_tensor_guid_t const &adam_v, - non_graph_tensor_guid_t const &adam_m) { + reduced_tensor_t const &weight, + reduced_tensor_t const &adam_v, + reduced_tensor_t const &adam_m) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); - b.bind(ADAM_M, TensorGuidSpec{UnifiedTensorGuid{adam_m}, IsGrad::YES}); - b.bind(ADAM_V, TensorGuidSpec{UnifiedTensorGuid{adam_v}, IsGrad::YES}); + b.bind(WEIGHT, TensorType::FORWARD, weight); + b.bind(WEIGHT, TensorType::GRADIENT, weight); + b.bind(ADAM_M, TensorType::OPTIMIZER, adam_m); + b.bind(ADAM_V, TensorType::OPTIMIZER, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -191,8 +194,8 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, - tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors) { + reduced_tensor_t const &weight, + std::vector const &grad_buffer_tensors) { return 
attrs.visit(overload{ [&](SGDOptimizerAttrs const &s) { return sgd_update(s, weight, grad_buffer_tensors.at(0)); diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 45d9d0cdb9..5261eec217 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -4,13 +4,16 @@ namespace FlexFlow { -void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) { - this->bind(slot_id_t{name}, tensor_guid_spec); +void TaskBinding::bind(int name, + TensorType const &tensor_type, + reduced_tensor_t const &binding) { + this->bind(slot_id_t{name}, tensor_type, binding); } -void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) { - this->tensor_bindings.insert( - {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec}); +void TaskBinding::bind(slot_id_t name, + TensorType const &tensor_type, + reduced_tensor_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, tensor_type}, binding}); } void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) { @@ -26,13 +29,13 @@ bool TaskBinding::operator!=(TaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> TaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & TaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc index 27bcbcd266..a608ab8ab8 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/local-execution/src/task_signature.cc @@ -8,17 +8,17 @@ TaskSignature make_empty_task_signature() { void add_slot(TaskSignature &task_signature, int name, - IsGrad is_grad, + TensorType tensor_type, SlotType slot_type) { - add_slot(task_signature, slot_id_t{name}, is_grad, slot_type); + add_slot(task_signature, slot_id_t{name}, tensor_type, slot_type); } void add_slot(TaskSignature &task_signature, slot_id_t name, - IsGrad is_grad, + TensorType tensor_type, SlotType slot_type) { - TensorGuidSlotSpec tensor_guid_slot_spec = - TensorGuidSlotSpec{slot_type, is_grad}; + TensorTypeSlotSpec tensor_guid_slot_spec = + TensorTypeSlotSpec{slot_type, tensor_type}; task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec}); } diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc new file mode 100644 index 0000000000..19324509bb --- /dev/null +++ b/lib/local-execution/src/tensor_reduction.cc @@ -0,0 +1,17 @@ +#include "local-execution/tensor_reduction.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { + return reduced_tensor_t{tensor_guid.raw_graph_output.idx}; +} + +std::vector + lower(std::vector const &tensor_guids) { + return transform(tensor_guids, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 11afc5b209..73473d6ac5 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -1,8 +1,6 @@ #ifndef _OPERATOR_PARAMS_H #define _OPERATOR_PARAMS_H -#include "op-attrs/ops/core.h" -#include "op-attrs/pcg_operator_attrs.dtg.h" #include 
"local-execution/ops/attention.h" #include "local-execution/ops/batch_matmul.h" #include "local-execution/ops/batch_norm.h" @@ -32,6 +30,8 @@ #include "local-execution/ops/split.h" #include "local-execution/ops/topk.h" #include "local-execution/ops/transpose.h" +#include "op-attrs/ops/core.h" +#include "op-attrs/pcg_operator_attrs.dtg.h" #include "utils/record_formatter.h" #include "utils/variant.h" #include From 0cdfb1a7edd9ea283f678f06950054a701be8600 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 7 Jan 2025 15:14:49 -0800 Subject: [PATCH 23/91] Fixes --- lib/kernels/include/kernels/array_shape.h | 2 - lib/kernels/include/kernels/legion_dim.h | 3 - lib/kernels/src/array_shape.cc | 19 +++--- lib/kernels/src/cuda/ops/concat_kernels.cu | 4 +- lib/kernels/src/legion_dim.cc | 9 --- .../include/local-execution/arg_ref.h | 2 +- .../include/local-execution/concrete_arg.h | 6 +- .../layer_tensor_key.struct.toml | 2 - .../local-execution/local_training_backing.h | 2 +- .../include/local-execution/runtime_arg_ref.h | 12 ---- .../include/local-execution/task_binding.h | 8 +-- .../include/local-execution/task_registry.h | 2 +- .../{ => local-execution}/ops/attention.cc | 0 .../{ => local-execution}/ops/batch_matmul.cc | 0 .../{ => local-execution}/ops/batch_norm.cc | 0 .../src/{ => local-execution}/ops/cast.cc | 0 .../src/{ => local-execution}/ops/combine.cc | 0 .../src/{ => local-execution}/ops/concat.cc | 0 .../src/{ => local-execution}/ops/conv_2d.cc | 0 .../src/{ => local-execution}/ops/dropout.cc | 0 .../ops/element_binary.cc | 0 .../ops/element_unary.cc | 0 .../src/{ => local-execution}/ops/flat.cc | 0 .../src/{ => local-execution}/ops/gather.cc | 0 .../src/{ => local-execution}/ops/input.cc | 0 .../{ => local-execution}/ops/layer_norm.cc | 0 .../src/{ => local-execution}/ops/linear.cc | 0 .../src/{ => local-execution}/ops/noop.cc | 0 .../src/{ => local-execution}/ops/pool_2d.cc | 0 .../src/{ => local-execution}/ops/reduce.cc | 0 .../{ => local-execution}/ops/reduction.cc | 0 .../{ => local-execution}/ops/repartition.cc | 0 .../{ => local-execution}/ops/replicate.cc | 0 .../src/{ => local-execution}/ops/reshape.cc | 0 .../src/{ => local-execution}/ops/reverse.cc | 0 .../src/{ => local-execution}/ops/softmax.cc | 0 .../src/{ => local-execution}/ops/split.cc | 0 .../src/{ => local-execution}/ops/topk.cc | 0 .../{ => local-execution}/ops/transpose.cc | 0 .../src/{ => local-execution}/ops/weight.cc | 0 .../src/local_cost_estimator.cc | 22 ++----- .../src/local_training_backing.cc | 13 +++-- .../src/model_training_instance.cc | 2 +- lib/local-execution/src/task_binding.cc | 13 +++++ lib/local-execution/src/task_registry.cc | 2 +- lib/local-execution/src/task_signature.cc | 2 +- lib/local-execution/src/tensor_reduction.cc | 2 +- .../test/src/test_local_slots_backing.cc | 47 ++++++++------- .../test/src/test_local_task_arg_accessor.cc | 56 +++++++++--------- lib/local-execution/test/src/test_loss_e2e.cc | 35 +++++------ .../test/src/test_update_e2e.cc | 9 +-- .../include/op-attrs/operator_attrs.h | 58 +++++++++---------- lib/pcg/include/pcg/optimizer_attrs.h | 2 +- lib/pcg/src/pcg/optimizer_attrs.cc | 2 +- 54 files changed, 154 insertions(+), 182 deletions(-) rename lib/local-execution/src/{ => local-execution}/ops/attention.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/batch_matmul.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/batch_norm.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/cast.cc (100%) rename 
lib/local-execution/src/{ => local-execution}/ops/combine.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/concat.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/conv_2d.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/dropout.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/element_binary.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/element_unary.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/flat.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/gather.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/input.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/layer_norm.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/linear.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/noop.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/pool_2d.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reduce.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reduction.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/repartition.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/replicate.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reshape.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reverse.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/softmax.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/split.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/topk.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/transpose.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/weight.cc (100%) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index fd66697793..bc3ca34e6a 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -43,8 +43,6 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const; - ArrayShape sub_shape(std::optional start, std::optional end) const; diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 29c5e29a93..e4dd9723b8 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -10,9 +10,6 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions); -std::optional legion_dim_from_ff_dim(std::optional, - int num_dimensions); - template using LegionOrdered = DimOrdered; diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 31ee7b6001..eb2b88b203 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -1,6 +1,7 @@ #include "kernels/array_shape.h" #include "op-attrs/dim_ordered/slice.h" #include "utils/containers/product.h" +#include "utils/containers/transform.h" namespace FlexFlow { @@ -54,17 +55,17 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } -ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const { - legion_dim_t legion_end = legion_dim_from_ff_dim(end, num_dims()); - return this->sub_shape(start, legion_end); -} - ArrayShape ArrayShape::sub_shape(std::optional start, 
std::optional end) const { - std::optional legion_start = - legion_dim_from_ff_dim(start, num_dims()); - std::optional legion_end = - legion_dim_from_ff_dim(end, num_dims()); + std::optional legion_start = transform( + start, [&](auto const &start_unwrapped) { + return legion_dim_from_ff_dim(start_unwrapped, num_dims()); + }); + + std::optional legion_end = transform( + end, [&](auto const &end_unwrapped) { + return legion_dim_from_ff_dim(end_unwrapped, num_dims()); + }); return this->sub_shape(legion_start, legion_end); } diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 68004738d2..ce6178c7cc 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -15,6 +15,7 @@ #include "device.h" #include "kernels/concat_kernels.h" +#include "kernels/legion_dim.h" #include namespace FlexFlow { @@ -25,7 +26,8 @@ void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0}, axis).num_elements(); + legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims()); + blk_size = shape.sub_shape(legion_dim_t{0}, axis_legion_dim).num_elements(); num_blocks = shape.sub_shape(axis, std::nullopt).num_elements(); } diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index c190a02220..9ef47d40ae 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -10,13 +10,4 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { return legion_dim_t(num_dimensions - ff_dim.value - 1); } -std::optional - legion_dim_from_ff_dim(std::optional ff_dim, int num_dimensions) { - if (ff_dim.has_value()) { - return legion_dim_from_ff_dim(ff_dim.value(), num_dimensions); - } else { - return std::nullopt; - } -} - } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h index 30da405c13..75eecda273 100644 --- a/lib/local-execution/include/local-execution/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -82,7 +82,7 @@ template struct hash<::FlexFlow::ArgRefSpec> { size_t operator()(::FlexFlow::ArgRefSpec const &s) const { size_t result = 0; - ::FlexFlow::hash_combine(result, s.type_idx); + ::FlexFlow::hash_combine(result, s.type_idx, s.get_ref_type()); return result; } }; diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h index ac5d97f3c4..cee52ba4a2 100644 --- a/lib/local-execution/include/local-execution/concrete_arg.h +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -24,6 +24,10 @@ struct ConcreteArgSpec { return this->type_idx; } + std::shared_ptr get_ptr() const { + return this->ptr; + } + bool operator==(ConcreteArgSpec const &other) const; bool operator!=(ConcreteArgSpec const &other) const; @@ -60,7 +64,7 @@ template <> struct hash<::FlexFlow::ConcreteArgSpec> { size_t operator()(::FlexFlow::ConcreteArgSpec const &s) const { size_t result = 0; - ::FlexFlow::hash_combine(result, s.get_type_index()); + ::FlexFlow::hash_combine(result, s.get_type_index(), s.get_ptr()); return result; } }; diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml index 3ec6d7b0f1..33a7a9174f 100644 --- 
a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml +++ b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml @@ -4,8 +4,6 @@ features = [ "eq", "ord", "hash", - "json", - "rapidcheck", "fmt", ] diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index cbab4bf031..26ebfbe3c4 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -35,6 +35,7 @@ struct LocalTrainingBacking { std::optional const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; + LocalSlotsBacking local_slots_backing; private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, @@ -45,7 +46,6 @@ struct LocalTrainingBacking { Allocator allocator; ComputationGraph computation_graph; TaskRegistry task_registry; - LocalSlotsBacking local_slots_backing; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h index fd79e23126..a225a813df 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -27,18 +27,6 @@ RuntimeArgRef profiling_settings(); RuntimeArgRef> ff_handle(); RuntimeArgRef iteration_config(); -// std::string format_as(RuntimeArgRefSpec const & x) { -// std::ostringstream oss; -// oss << ""; -// return oss.str(); -// } - -// std::ostream &operator<<(std::ostream & s, RuntimeArgRefSpec const & x) { -// return (s << fmt::to_string(x)); -// } - } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index 93461e2e55..e211592ea6 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -7,7 +7,6 @@ #include "local-execution/task_arg_spec.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" -#include "utils/hash/unordered_map.h" namespace FlexFlow { @@ -63,12 +62,7 @@ namespace std { template <> struct hash<::FlexFlow::TaskBinding> { - size_t operator()(::FlexFlow::TaskBinding const &s) const { - size_t result = 0; - hash_combine(result, s.get_tensor_bindings()); - hash_combine(result, s.get_arg_bindings()); - return result; - } + size_t operator()(::FlexFlow::TaskBinding const &s) const; }; } // namespace std diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index 24790a28e3..fa3e558337 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -14,7 +14,7 @@ void register_tasks_for_layer(TaskRegistry &, layer_guid_t const &, ComputationGraphOpAttrs const &attrs); -bool registry_contains_op_task(TaskRegistry const &, +bool registry_contains_task_for_layer(TaskRegistry const &, layer_guid_t const &, OpTaskType const &); diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/local-execution/ops/attention.cc similarity index 100% rename from lib/local-execution/src/ops/attention.cc rename to lib/local-execution/src/local-execution/ops/attention.cc diff --git 
a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/local-execution/ops/batch_matmul.cc similarity index 100% rename from lib/local-execution/src/ops/batch_matmul.cc rename to lib/local-execution/src/local-execution/ops/batch_matmul.cc diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/local-execution/ops/batch_norm.cc similarity index 100% rename from lib/local-execution/src/ops/batch_norm.cc rename to lib/local-execution/src/local-execution/ops/batch_norm.cc diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/local-execution/ops/cast.cc similarity index 100% rename from lib/local-execution/src/ops/cast.cc rename to lib/local-execution/src/local-execution/ops/cast.cc diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/local-execution/ops/combine.cc similarity index 100% rename from lib/local-execution/src/ops/combine.cc rename to lib/local-execution/src/local-execution/ops/combine.cc diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/local-execution/ops/concat.cc similarity index 100% rename from lib/local-execution/src/ops/concat.cc rename to lib/local-execution/src/local-execution/ops/concat.cc diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/local-execution/ops/conv_2d.cc similarity index 100% rename from lib/local-execution/src/ops/conv_2d.cc rename to lib/local-execution/src/local-execution/ops/conv_2d.cc diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/local-execution/ops/dropout.cc similarity index 100% rename from lib/local-execution/src/ops/dropout.cc rename to lib/local-execution/src/local-execution/ops/dropout.cc diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/local-execution/ops/element_binary.cc similarity index 100% rename from lib/local-execution/src/ops/element_binary.cc rename to lib/local-execution/src/local-execution/ops/element_binary.cc diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/local-execution/ops/element_unary.cc similarity index 100% rename from lib/local-execution/src/ops/element_unary.cc rename to lib/local-execution/src/local-execution/ops/element_unary.cc diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/local-execution/ops/flat.cc similarity index 100% rename from lib/local-execution/src/ops/flat.cc rename to lib/local-execution/src/local-execution/ops/flat.cc diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/local-execution/ops/gather.cc similarity index 100% rename from lib/local-execution/src/ops/gather.cc rename to lib/local-execution/src/local-execution/ops/gather.cc diff --git a/lib/local-execution/src/ops/input.cc b/lib/local-execution/src/local-execution/ops/input.cc similarity index 100% rename from lib/local-execution/src/ops/input.cc rename to lib/local-execution/src/local-execution/ops/input.cc diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/local-execution/ops/layer_norm.cc similarity index 100% rename from lib/local-execution/src/ops/layer_norm.cc rename to lib/local-execution/src/local-execution/ops/layer_norm.cc diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc similarity index 100% rename from lib/local-execution/src/ops/linear.cc rename to lib/local-execution/src/local-execution/ops/linear.cc diff --git a/lib/local-execution/src/ops/noop.cc 
b/lib/local-execution/src/local-execution/ops/noop.cc similarity index 100% rename from lib/local-execution/src/ops/noop.cc rename to lib/local-execution/src/local-execution/ops/noop.cc diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/local-execution/ops/pool_2d.cc similarity index 100% rename from lib/local-execution/src/ops/pool_2d.cc rename to lib/local-execution/src/local-execution/ops/pool_2d.cc diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/local-execution/ops/reduce.cc similarity index 100% rename from lib/local-execution/src/ops/reduce.cc rename to lib/local-execution/src/local-execution/ops/reduce.cc diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/local-execution/ops/reduction.cc similarity index 100% rename from lib/local-execution/src/ops/reduction.cc rename to lib/local-execution/src/local-execution/ops/reduction.cc diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/local-execution/ops/repartition.cc similarity index 100% rename from lib/local-execution/src/ops/repartition.cc rename to lib/local-execution/src/local-execution/ops/repartition.cc diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/local-execution/ops/replicate.cc similarity index 100% rename from lib/local-execution/src/ops/replicate.cc rename to lib/local-execution/src/local-execution/ops/replicate.cc diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/local-execution/ops/reshape.cc similarity index 100% rename from lib/local-execution/src/ops/reshape.cc rename to lib/local-execution/src/local-execution/ops/reshape.cc diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/local-execution/ops/reverse.cc similarity index 100% rename from lib/local-execution/src/ops/reverse.cc rename to lib/local-execution/src/local-execution/ops/reverse.cc diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/local-execution/ops/softmax.cc similarity index 100% rename from lib/local-execution/src/ops/softmax.cc rename to lib/local-execution/src/local-execution/ops/softmax.cc diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/local-execution/ops/split.cc similarity index 100% rename from lib/local-execution/src/ops/split.cc rename to lib/local-execution/src/local-execution/ops/split.cc diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/local-execution/ops/topk.cc similarity index 100% rename from lib/local-execution/src/ops/topk.cc rename to lib/local-execution/src/local-execution/ops/topk.cc diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc similarity index 100% rename from lib/local-execution/src/ops/transpose.cc rename to lib/local-execution/src/local-execution/ops/transpose.cc diff --git a/lib/local-execution/src/ops/weight.cc b/lib/local-execution/src/local-execution/ops/weight.cc similarity index 100% rename from lib/local-execution/src/ops/weight.cc rename to lib/local-execution/src/local-execution/ops/weight.cc diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index c99a2b154f..02265281b0 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,4 +1,5 @@ #include "local-execution/local_cost_estimator.h" +#include "local-execution/tensor_reduction.h" #include 
"kernels/device.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/tracked_allocator.h" @@ -8,21 +9,11 @@ #include "pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" #include "utils/containers/transform.h" +#include "utils/containers/values.h" +#include "utils/containers/sum.h" namespace FlexFlow { -static float get_total_elapsed_time(PerLayerElapsedTime const &fwd, - PerLayerElapsedTime const &bwd) { - float total_elapsed_time = 0; - for (auto const &layer_elapsed_time : fwd) { - layer_guid_t layer_id = layer_elapsed_time.first; - float fwd_time = layer_elapsed_time.second.value(); - float bwd_time = bwd.at(layer_id).value(); - total_elapsed_time += fwd_time + bwd_time; - } - return total_elapsed_time; -} - LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) : runtime_arg_config(config) {} @@ -45,7 +36,6 @@ CostDetails LocalCostEstimator::estimate_cost( std::shared_ptr tracked_allocator_ptr = std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); - TensorBackingMap tensor_backing_map; std::vector input_tensor_ids; ComputationGraphBuilder cg_builder; @@ -53,9 +43,6 @@ CostDetails LocalCostEstimator::estimate_cost( TensorShape tensor_shape = get_piece_shape(input); tensor_guid_t tensor_id = cg_builder.create_input(tensor_shape, CreateGrad::YES); - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_shape); - tensor_backing_map.insert({tensor_id, tensor_backing}); input_tensor_ids.push_back(tensor_id); } @@ -79,7 +66,8 @@ CostDetails LocalCostEstimator::estimate_cost( LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - tensor_backing_map, + LayerTensorBackingMap{}, + TensorBackingMap{}, this->runtime_arg_config); local_backing.register_and_allocate_layer(layer_added_result.layer); local_backing.execute_init(layer_added_result.layer); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index e432b1afe9..f02a8c7824 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -68,7 +68,7 @@ std::optional } void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { - if (registry_contains_op_task( + if (registry_contains_task_for_layer( this->task_registry, operator_node, OpTaskType::INIT)) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; @@ -85,7 +85,7 @@ void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { std::optional LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) { - if (registry_contains_op_task( + if (registry_contains_task_for_layer( this->task_registry, operator_node, OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; @@ -102,11 +102,10 @@ std::optional void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, reduced_tensor_t const &logit_tensor, reduced_tensor_t const &label_tensor) { - assert( - this->local_slots_backing.is_non_graph_tensor_allocated(logit_tensor) && - this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); + assert(this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // 
assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = this->get_task_arg_accessor(loss_invocation, std::nullopt); @@ -116,7 +115,7 @@ void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, std::optional LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) { - if (registry_contains_op_task( + if (registry_contains_task_for_layer( this->task_registry, operator_node, OpTaskType::BWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; @@ -143,6 +142,8 @@ void LocalTrainingBacking::execute_update( // get invocation TaskInvocation invocation = get_update_invocation( optimizer_attrs, weight_tensor, optimizer_buffer_tensors); + + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 5a58e4c524..4815de5e85 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -71,7 +71,7 @@ void ModelTrainingInstance::execute_update() { this->training_backing.execute_update(node, this->optimizer_attrs); } this->optimizer_attrs = - get_next_iteration_optimizer_attrs(this->optimizer_attrs); + get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 5261eec217..2b1256df90 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,6 +1,7 @@ #include "local-execution/task_binding.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" +#include "utils/hash/unordered_map.h" namespace FlexFlow { @@ -58,3 +59,15 @@ std::ostream &operator<<(std::ostream &s, TaskBinding const &x) { } } // namespace FlexFlow + +namespace std { + +size_t hash<::FlexFlow::TaskBinding>::operator() ( + ::FlexFlow::TaskBinding const &s) const { + size_t result = 0; + hash_combine(result, s.get_tensor_bindings()); + hash_combine(result, s.get_arg_bindings()); + return result; + } + +} // namespace std diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 3cd2cccae8..be1cf73e11 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -42,7 +42,7 @@ void register_tasks_for_layer(TaskRegistry &task_registry, } } -bool registry_contains_op_task(TaskRegistry const &task_registry, +bool registry_contains_task_for_layer(TaskRegistry const &task_registry, layer_guid_t const &op, OpTaskType const &op_task_type) { std::unordered_map> task_ids; diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc index a608ab8ab8..1d57a1fc54 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/local-execution/src/task_signature.cc @@ -18,7 +18,7 @@ void add_slot(TaskSignature &task_signature, TensorType tensor_type, SlotType slot_type) { TensorTypeSlotSpec tensor_guid_slot_spec = - TensorTypeSlotSpec{slot_type, tensor_type}; + TensorTypeSlotSpec{name, tensor_type, slot_type}; task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec}); } diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc index 19324509bb..1d0cb7a2e9 100644 --- 
a/lib/local-execution/src/tensor_reduction.cc +++ b/lib/local-execution/src/tensor_reduction.cc @@ -4,7 +4,7 @@ namespace FlexFlow { reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { - return reduced_tensor_t{tensor_guid.raw_graph_output.idx}; + return reduced_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; } std::vector diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 5d58e7e757..c9e95fe444 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -2,10 +2,12 @@ #include "local-execution/local_cost_estimator.h" #include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" +#include "local-execution/tensor_reduction.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" +#include "utils/containers/get_only.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" #include "test/utils/doctest/fmt/variant.h" @@ -66,8 +68,12 @@ TEST_SUITE(FF_TEST_SUITE) { layer_guid_t layer_guid = get_layer_by_name(cg_builder.computation_graph, layer_name); - TensorBackingMap tensor_backing_map = { - {query_guid, query}, {key_guid, key}, {value_guid, value}}; + LayerTensorBackingMap layer_tensor_backing_map = { + {LayerTensorKey{layer_guid, lower(query_guid)}, query}, + {LayerTensorKey{layer_guid, lower(key_guid)}, key}, + {LayerTensorKey{layer_guid, lower(value_guid)}, value}, + //{LayerTensorKey{layer_guid, lower(output_guid), output}} + }; // runtime arg config ProfilingSettings settings = ProfilingSettings{/*warmup_iters=*/0, @@ -78,14 +84,15 @@ TEST_SUITE(FF_TEST_SUITE) { EnableProfiling::NO, settings}; - LocalSlotsBacking local_slots_backing = {tensor_backing_map, + LocalSlotsBacking local_slots_backing = {layer_tensor_backing_map, + TensorBackingMap{}, runtime_arg_config}; SUBCASE("LocalSlotsBacking::allocate_tensors_by_role") { auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, - TensorBackingMap m) -> std::pair { - GenericTensorAccessorW accessor = m.at(t); + [&](tensor_guid_t t, layer_guid_t l, + LayerTensorBackingMap m) -> std::pair { + GenericTensorAccessorW accessor = m.at(LayerTensorKey{l, lower(t)}); return get_shape_and_datatype(accessor); }; @@ -101,7 +108,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Query grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, local_slots_backing.gradient_tensor_mapping); + query_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{query_shape}, dtype}; CHECK(result == correct); @@ -109,7 +116,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, local_slots_backing.gradient_tensor_mapping); + key_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{key_shape}, dtype}; CHECK(result == correct); @@ -117,7 +124,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, local_slots_backing.gradient_tensor_mapping); + value_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{value_shape}, dtype}; CHECK(result == correct); @@ -132,7 +139,7 @@ 
TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.tensor_mapping); + output_guid, layer_guid, local_slots_backing.tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -143,7 +150,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.gradient_tensor_mapping); + output_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -159,19 +166,19 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Input tensor slots") { std::vector correct_incoming_input_tensors = get_incoming_inputs(cg_builder.computation_graph, layer_guid); - CHECK(correct_incoming_input_tensors == + CHECK(lower(correct_incoming_input_tensors) == local_slots_backing.input_tensor_slots.at(layer_guid)); } SUBCASE("Weight tensor slots") { std::vector correct_incoming_weight_tensors = get_incoming_weights(cg_builder.computation_graph, layer_guid); - CHECK(correct_incoming_weight_tensors == + CHECK(lower(correct_incoming_weight_tensors) == local_slots_backing.weight_tensor_slots.at(layer_guid)); } SUBCASE("Output tensor slots") { std::vector correct_outgoing_tensors = get_outgoing_tensors(cg_builder.computation_graph, layer_guid); - CHECK(correct_outgoing_tensors == + CHECK(lower(correct_outgoing_tensors) == local_slots_backing.output_tensor_slots.at(layer_guid)); } } @@ -231,12 +238,12 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(output_attrs.shape); return get_slots_backing_without_tensor_allocation_addresses( TensorSlotsBacking{ - {SlotGradId{slot_id_t{QUERY}, IsGrad::NO}, query}, - {SlotGradId{slot_id_t{KEY}, IsGrad::NO}, key}, - {SlotGradId{slot_id_t{VALUE}, IsGrad::NO}, value}, - {SlotGradId{slot_id_t{WEIGHTS}, IsGrad::NO}, weights}, - {SlotGradId{slot_id_t{OUTPUT}, IsGrad::NO}, output}, - {SlotGradId{slot_id_t{QUERY}, IsGrad::YES}, query}}); + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, query}, + {SlotTensorTypeId{slot_id_t{KEY}, TensorType::FORWARD}, key}, + {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, value}, + {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, weights}, + {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, output}, + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, query}}); }(); CHECK(result == correct); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index f52fccb1ed..bddda7acd1 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -37,68 +37,68 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorSlotsBacking tensor_slots_backing = { - {SlotGradId{slot_id_t{INPUT}, IsGrad::NO}, input}, - {SlotGradId{slot_id_t{INPUT}, IsGrad::YES}, input_grad}, - {SlotGradId{slot_id_t{VARIADIC_TENSORS}, IsGrad::NO}, variadic_tensors}, - {SlotGradId{slot_id_t{VARIADIC_TENSORS}, IsGrad::YES}, + {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::FORWARD}, input}, + {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::GRADIENT}, input_grad}, + {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, variadic_tensors}, + {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::GRADIENT}, variadic_tensors_grad}, }; 
LocalTaskArgumentAccessor acc = {allocator, tensor_slots_backing, {}}; SUBCASE("get_tensor") { - SUBCASE("get_tensor(slot_id_t, Permissions::RO, IsGrad::NO)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input)}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, IsGrad::NO); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::RO, IsGrad::YES)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input_grad)}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, IsGrad::YES); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::WO, IsGrad::NO)") { + SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, IsGrad::NO); + acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::WO, IsGrad::YES)") { + SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, IsGrad::YES); + acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::RW, IsGrad::NO)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, IsGrad::NO); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::RW, IsGrad::YES)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, IsGrad::YES); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::GRADIENT); CHECK(correct == result); } } SUBCASE("get_variadic_tensor") { - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, IsGrad::NO)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{std::vector{ read_only_accessor_from_write_accessor(variadic_tensors.at(0)), read_only_accessor_from_write_accessor( variadic_tensors.at(1))}}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RO, IsGrad::NO); + slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::FORWARD); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, IsGrad::YES)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{std::vector{ read_only_accessor_from_write_accessor( @@ -106,35 +106,35 @@ 
TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor( variadic_tensors_grad.at(1))}}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RO, IsGrad::YES); + slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::GRADIENT); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::NO)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::WO, IsGrad::NO); + slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::FORWARD); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::YES)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors_grad}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::WO, IsGrad::YES); + slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::GRADIENT); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::NO)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RW, IsGrad::NO); + slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::FORWARD); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::YES)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors_grad}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RW, IsGrad::YES); + slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::GRADIENT); CHECK(result == correct); } } diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index c4662d624c..5793d02f31 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -1,4 +1,5 @@ #include "doctest/doctest.h" +#include "local-execution/tensor_reduction.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" @@ -35,61 +36,53 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::string layer_name = "scalar multiply"; tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar, layer_name); + layer_guid_t layer_guid = get_layer_by_name(cg_builder.computation_graph, layer_name); // allocate memory Allocator allocator = create_local_cuda_memory_allocator(); - TensorBackingMap tensor_backing_map; - GenericTensorAccessorW input_backing = - allocator.allocate_tensor(input_shape); - tensor_backing_map.insert({input_tensor, input_backing}); LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - tensor_backing_map, + LayerTensorBackingMap{}, + TensorBackingMap{}, runtime_arg_config); - // for (layer_guid_t const & node: - // topological_ordering(cg_builder.computation_graph)) { - // local_backing.register_and_allocate_layer(node); 
- // } - local_backing.register_and_allocate_layer( - get_layer_by_name(cg_builder.computation_graph, layer_name)); + + local_backing.register_and_allocate_layer(layer_guid); SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { TensorShape label_shape = TensorShape{ TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; - tensor_guid_t label_tensor = - cg_builder.create_input(label_shape, CreateGrad::NO); + reduced_tensor_t label_tensor = reduced_tensor_t{-1}; GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); - local_backing.insert_tensor(label_tensor, label_backing); + local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing}); LossAttrs loss_attrs = LossAttrs{ SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("NonconfigurableLossAttrs") { - tensor_guid_t label_tensor = - cg_builder.create_input(input_shape, CreateGrad::NO); + reduced_tensor_t label_tensor = reduced_tensor_t{-1}; GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); - local_backing.insert_tensor(label_tensor, label_backing); + local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing}); SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("LossFunction::IDENTITY") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } } } diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index b48214d89d..2e5e386a95 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -3,6 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" +#include "local-execution/tensor_reduction.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" @@ -37,14 +38,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // allocate memory Allocator allocator = create_local_cuda_memory_allocator(); - TensorBackingMap tensor_backing_map; - GenericTensorAccessorW input_backing = - allocator.allocate_tensor(input_shape); - tensor_backing_map.insert({input_tensor, input_backing}); - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - tensor_backing_map, + LayerTensorBackingMap{}, + TensorBackingMap{}, runtime_arg_config); // for (layer_guid_t const & node: // topological_ordering(cg_builder.computation_graph)) { diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 73473d6ac5..483e735196 
100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -1,35 +1,35 @@ #ifndef _OPERATOR_PARAMS_H #define _OPERATOR_PARAMS_H -#include "local-execution/ops/attention.h" -#include "local-execution/ops/batch_matmul.h" -#include "local-execution/ops/batch_norm.h" -#include "local-execution/ops/broadcast.h" -#include "local-execution/ops/cast.h" -#include "local-execution/ops/combine.h" -#include "local-execution/ops/concat.h" -#include "local-execution/ops/conv_2d.h" -#include "local-execution/ops/dropout.h" -#include "local-execution/ops/element_binary.h" -#include "local-execution/ops/element_unary.h" -#include "local-execution/ops/embedding.h" -#include "local-execution/ops/flat.h" -#include "local-execution/ops/gather.h" -#include "local-execution/ops/input.h" -#include "local-execution/ops/layer_norm.h" -#include "local-execution/ops/linear.h" -#include "local-execution/ops/noop.h" -#include "local-execution/ops/pool_2d.h" -#include "local-execution/ops/reduce.h" -#include "local-execution/ops/reduction.h" -#include "local-execution/ops/repartition.h" -#include "local-execution/ops/replicate.h" -#include "local-execution/ops/reshape.h" -#include "local-execution/ops/reverse.h" -#include "local-execution/ops/softmax.h" -#include "local-execution/ops/split.h" -#include "local-execution/ops/topk.h" -#include "local-execution/ops/transpose.h" +#include "op-attrs/ops/attention.h" +#include "op-attrs/ops/batch_matmul.h" +#include "op-attrs/ops/batch_norm.h" +#include "op-attrs/ops/broadcast.h" +#include "op-attrs/ops/cast.h" +#include "op-attrs/ops/combine.h" +#include "op-attrs/ops/concat.h" +#include "op-attrs/ops/conv_2d.h" +#include "op-attrs/ops/dropout.h" +#include "op-attrs/ops/element_binary.h" +#include "op-attrs/ops/element_unary.h" +#include "op-attrs/ops/embedding.h" +#include "op-attrs/ops/flat.h" +#include "op-attrs/ops/gather.h" +#include "op-attrs/ops/input.h" +#include "op-attrs/ops/layer_norm.h" +#include "op-attrs/ops/linear.h" +#include "op-attrs/ops/noop.h" +#include "op-attrs/ops/pool_2d.h" +#include "op-attrs/ops/reduce.h" +#include "op-attrs/ops/reduction.h" +#include "op-attrs/ops/repartition.h" +#include "op-attrs/ops/replicate.h" +#include "op-attrs/ops/reshape.h" +#include "op-attrs/ops/reverse.h" +#include "op-attrs/ops/softmax.h" +#include "op-attrs/ops/split.h" +#include "op-attrs/ops/topk.h" +#include "op-attrs/ops/transpose.h" #include "op-attrs/ops/core.h" #include "op-attrs/pcg_operator_attrs.dtg.h" #include "utils/record_formatter.h" diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index d4abd1b52f..1d74694c29 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -6,7 +6,7 @@ namespace FlexFlow { -OptimizerAttrs get_next_iteration_optimizer_attrs(OptimizerAttrs const &old); +OptimizerAttrs get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old); } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index 8d66f7af7e..ce2d3d0db7 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -3,7 +3,7 @@ namespace FlexFlow { OptimizerAttrs - get_next_iteration_optimizer_attrs(OptimizerAttrs const &old_attrs) { + get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old_attrs) { if (old_attrs.has()) { AdamOptimizerAttrs old = old_attrs.get(); double new_beta1_t = old.beta_t * old.beta1; From 
9d252b37ef49d92cd358346472a9a94589ad7f81 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 15 Jan 2025 14:32:29 -0800 Subject: [PATCH 24/91] Remove tensor lower --- .../local-execution/tensor_reduction.h | 2 -- .../src/local_slots_backing.cc | 18 +++++++++++--- lib/local-execution/src/tensor_reduction.cc | 7 ------ .../test/src/test_local_slots_backing.cc | 24 ++++++++++++------- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/lib/local-execution/include/local-execution/tensor_reduction.h b/lib/local-execution/include/local-execution/tensor_reduction.h index eb55b07ee4..2cb0b12ff0 100644 --- a/lib/local-execution/include/local-execution/tensor_reduction.h +++ b/lib/local-execution/include/local-execution/tensor_reduction.h @@ -8,8 +8,6 @@ namespace FlexFlow { reduced_tensor_t lower(tensor_guid_t const &); -std::vector lower(std::vector const &); - } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index f1bb5a9a5b..8a277adc78 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -43,15 +43,27 @@ void LocalSlotsBacking::allocate_tensors_by_role( switch (role) { case TensorRole::INPUT: tensors = get_incoming_inputs(computation_graph, layer_guid); - this->input_tensor_slots.insert({layer_guid, lower(tensors)}); + this->input_tensor_slots.insert({layer_guid, + transform(tensors, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }) + }); break; case TensorRole::WEIGHT: tensors = get_incoming_weights(computation_graph, layer_guid); - this->weight_tensor_slots.insert({layer_guid, lower(tensors)}); + this->weight_tensor_slots.insert({layer_guid, + transform(tensors, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }) + }); break; case TensorRole::OUTPUT: tensors = get_outgoing_tensors(computation_graph, layer_guid); - this->output_tensor_slots.insert({layer_guid, lower(tensors)}); + this->output_tensor_slots.insert({layer_guid, + transform(tensors, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }) + }); break; default: throw mk_runtime_error("Invalid tensor role, got {}", role); diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc index 1d0cb7a2e9..ae5b188dfd 100644 --- a/lib/local-execution/src/tensor_reduction.cc +++ b/lib/local-execution/src/tensor_reduction.cc @@ -7,11 +7,4 @@ reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { return reduced_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; } -std::vector - lower(std::vector const &tensor_guids) { - return transform(tensor_guids, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); -} - } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index c9e95fe444..88dfa34783 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -164,21 +164,27 @@ TEST_SUITE(FF_TEST_SUITE) { local_slots_backing.allocate_layer_tensors( layer_guid, cg_builder.computation_graph, allocator); SUBCASE("Input tensor slots") { - std::vector correct_incoming_input_tensors = - get_incoming_inputs(cg_builder.computation_graph, layer_guid); - CHECK(lower(correct_incoming_input_tensors) == + std::vector correct_incoming_input_tensors = + 
transform(get_incoming_inputs(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); + CHECK(correct_incoming_input_tensors == local_slots_backing.input_tensor_slots.at(layer_guid)); } SUBCASE("Weight tensor slots") { - std::vector correct_incoming_weight_tensors = - get_incoming_weights(cg_builder.computation_graph, layer_guid); - CHECK(lower(correct_incoming_weight_tensors) == + std::vector correct_incoming_weight_tensors = + transform(get_incoming_weights(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); + CHECK(correct_incoming_weight_tensors == local_slots_backing.weight_tensor_slots.at(layer_guid)); } SUBCASE("Output tensor slots") { - std::vector correct_outgoing_tensors = - get_outgoing_tensors(cg_builder.computation_graph, layer_guid); - CHECK(lower(correct_outgoing_tensors) == + std::vector correct_output_tensors = + transform(get_outgoing_tensors(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); + CHECK(correct_output_tensors == local_slots_backing.output_tensor_slots.at(layer_guid)); } } From 895c117100a0ac4cdb1cc1dead37f2efbe3786f9 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 16 Jan 2025 17:04:17 -0800 Subject: [PATCH 25/91] Add tensor and task lowering scheme --- .../layer_tensor_key.struct.toml | 21 -- .../local-execution/local_args_backing.h | 37 +++ .../local-execution/local_slots_backing.h | 89 ------ .../local-execution/local_tensor_backing.h | 58 ++++ .../local-execution/local_training_backing.h | 25 +- .../include/local-execution/loss_functions.h | 4 +- ....struct.toml => loss_tensor_t.struct.toml} | 2 +- .../local-execution/lowered_tensor_source.h | 21 ++ ...ruct.toml => lowered_tensor_t.struct.toml} | 6 +- .../local-execution/model_training_instance.h | 12 +- .../local-execution/op_task_invocation.h | 6 +- .../op_tensor_slot_spec.struct.toml | 6 +- .../include/local-execution/optimizer.h | 15 +- .../local-execution/optimizer_tensor_source.h | 21 ++ .../optimizer_tensor_t.struct.toml | 13 + .../local-execution/slot_grad_id.struct.toml | 21 ++ .../include/local-execution/task_binding.h | 24 +- .../{tensor_reduction.h => tensor_lowering.h} | 4 +- .../local-execution/tensor_type.enum.toml | 2 +- .../tensor_type_t.variant.toml | 26 ++ lib/local-execution/src/local_args_backing.cc | 62 ++++ .../src/local_cost_estimator.cc | 2 +- .../src/local_slots_backing.cc | 270 ------------------ .../src/local_tensor_backing.cc | 123 ++++++++ .../src/local_training_backing.cc | 109 ++++--- lib/local-execution/src/loss_functions.cc | 14 +- .../src/lowered_tensor_source.cc | 13 + .../src/model_training_instance.cc | 8 +- lib/local-execution/src/op_task_invocation.cc | 12 +- lib/local-execution/src/op_task_signature.cc | 18 +- lib/local-execution/src/optimizer.cc | 28 +- .../src/optimizer_tensor_source.cc | 13 + lib/local-execution/src/task_binding.cc | 45 ++- lib/local-execution/src/tensor_lowering.cc | 10 + lib/local-execution/src/tensor_reduction.cc | 10 - 35 files changed, 624 insertions(+), 526 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/layer_tensor_key.struct.toml create mode 100644 lib/local-execution/include/local-execution/local_args_backing.h delete mode 100644 lib/local-execution/include/local-execution/local_slots_backing.h create mode 100644 lib/local-execution/include/local-execution/local_tensor_backing.h rename 
lib/local-execution/include/local-execution/{reduced_tensor_t.struct.toml => loss_tensor_t.struct.toml} (82%) create mode 100644 lib/local-execution/include/local-execution/lowered_tensor_source.h rename lib/local-execution/include/local-execution/{non_graph_tensor_guid_t.struct.toml => lowered_tensor_t.struct.toml} (62%) create mode 100644 lib/local-execution/include/local-execution/optimizer_tensor_source.h create mode 100644 lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml create mode 100644 lib/local-execution/include/local-execution/slot_grad_id.struct.toml rename lib/local-execution/include/local-execution/{tensor_reduction.h => tensor_lowering.h} (67%) create mode 100644 lib/local-execution/include/local-execution/tensor_type_t.variant.toml create mode 100644 lib/local-execution/src/local_args_backing.cc delete mode 100644 lib/local-execution/src/local_slots_backing.cc create mode 100644 lib/local-execution/src/local_tensor_backing.cc create mode 100644 lib/local-execution/src/lowered_tensor_source.cc create mode 100644 lib/local-execution/src/optimizer_tensor_source.cc create mode 100644 lib/local-execution/src/tensor_lowering.cc delete mode 100644 lib/local-execution/src/tensor_reduction.cc
diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml deleted file mode 100644 index 33a7a9174f..0000000000 --- a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "LayerTensorKey" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "pcg/layer_guid_t.dtg.h", - "local-execution/reduced_tensor_t.dtg.h" -] - -[[fields]] -name = "layer_guid" -type = "::FlexFlow::layer_guid_t" - -[[fields]] -name = "reduced_tensor" -type = "::FlexFlow::reduced_tensor_t"
diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h new file mode 100644 index 0000000000..d497c49738 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H + +#include "pcg/layer_guid_t.dtg.h" +#include "pcg/computation_graph.h" +#include "local-execution/per_device_op_state.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/local_task_argument_accessor.h" + +namespace FlexFlow { + +struct LocalArgsBacking { + LocalArgsBacking(RuntimeArgConfig const &); + +public: + void add_per_device_op_state(layer_guid_t const &, + DeviceSpecificDeviceStates const &); + + ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; + + ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; + ConcreteArgSpec lower_to_concrete_arg_spec(OpArgRefSpec const &, + ComputationGraph const &, + layer_guid_t const &) const; + +public: + // arguments + std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> + per_device_op_states; + RuntimeArgConfig runtime_arg_config; +}; + +} // namespace FlexFlow + +#endif
diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h deleted file mode 100644 index a632f432cf..0000000000 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ /dev/null @@ 
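A minimal usage sketch of the LocalArgsBacking declared above; this driver function is illustrative only (not part of the patch) and assumes nothing beyond the header's declarations:

// Sketch: record a layer's init-time device state, then lower a binding's
// argument slots through the new args backing.
void example_lower_args(RuntimeArgConfig const &config,
                        layer_guid_t const &layer,
                        DeviceSpecificDeviceStates const &init_state,
                        TaskBinding const &binding) {
  LocalArgsBacking args_backing{config};
  // Cache the state produced by the layer's init task; later invocations for
  // this layer can then resolve per-device-state arg refs against it.
  args_backing.add_per_device_op_state(layer, init_state);
  // RuntimeArgRefSpecs in the binding are resolved against the stored runtime
  // config; ConcreteArgSpecs pass through unchanged.
  ArgSlotsBacking lowered = args_backing.construct_arg_slots_backing(binding);
}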
-1,89 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H - -#include "kernels/accessor.h" -#include "local-execution/layer_tensor_key.dtg.h" -#include "local-execution/local_task_argument_accessor.h" -#include "local-execution/non_graph_tensor_guid_t.dtg.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/per_device_op_state.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "pcg/computation_graph.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" - -namespace FlexFlow { - -using LayerTensorBackingMap = - std::unordered_map; - -using TensorBackingMap = - std::unordered_map; - -struct LocalSlotsBacking { - LocalSlotsBacking(LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, - RuntimeArgConfig const &); - -public: - void add_per_device_op_state(layer_guid_t const &, - DeviceSpecificDeviceStates const &); - void allocate_layer_tensors(layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_tensors_by_role(TensorRole const &, - layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_optimizer_tensors(layer_guid_t const &weight_layer, - tensor_guid_t const &, - ComputationGraph const &, - Allocator &, - TaskSignature const &); - TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, - layer_guid_t const &) const; - TensorSlotsBacking - construct_tensor_slots_backing(TaskBinding const &, - std::optional const &) const; - ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, - layer_guid_t const &) const; - ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; - - ConcreteArgSpec resolve_runtime_arg_ref_spec(RuntimeArgRefSpec const &) const; - ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, - layer_guid_t const &) const; - - GenericTensorAccessorW const & - get_tensor_backing(TensorType const &, - reduced_tensor_t const &, - std::optional const &) const; - - bool is_forward_tensor_allocated(LayerTensorKey const &) const; - bool is_non_graph_tensor_allocated(reduced_tensor_t const &) const; - -public: - // tensors - LayerTensorBackingMap tensor_mapping; - LayerTensorBackingMap gradient_tensor_mapping; - LayerTensorBackingMap optimizer_tensor_mapping; - TensorBackingMap non_graph_tensor_mapping; - std::unordered_map> - input_tensor_slots; - std::unordered_map> - weight_tensor_slots; - std::unordered_map> - output_tensor_slots; - std::unordered_map> - weight_optimizer_tensor_guids; - - // arguments - std::unordered_map - per_device_op_states; - RuntimeArgConfig runtime_arg_config; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h new file mode 100644 index 0000000000..68a38253f8 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -0,0 +1,58 @@ + +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H + +#include "kernels/accessor.h" +#include "local-execution/local_task_argument_accessor.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" +#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/lowered_tensor_source.h" +#include 
"local-execution/optimizer_tensor_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "pcg/layer_guid_t.dtg.h" + +namespace FlexFlow { + +using TensorBackingMap = + std::unordered_map; + +struct LocalTensorBacking { + LocalTensorBacking(); + +public: + void allocate_layer_tensors(layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_tensors_by_role(TensorRole const &, + layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_optimizer_tensors(tensor_guid_t const &, + std::vector const &, + Allocator &); + TensorSlotsBacking + construct_tensor_slots_backing(TaskBinding const &) const; + + GenericTensorAccessorW const & + get_tensor_backing(lowered_tensor_t const &) const; + + bool is_tensor_allocated(lowered_tensor_t const &) const; + +public: + // tensors + TensorBackingMap tensor_backings; + + std::unordered_map tensor_lowering_mapping; + std::unordered_map gradient_tensor_lowering_mapping; + std::unordered_map optimizer_tensor_lowering_mapping; + std::unordered_map loss_tensor_lowering_mapping; + + LoweredTensorSource lowered_tensor_source; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 26ebfbe3c4..a915f3e420 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -1,11 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#include "local-execution/local_slots_backing.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/local_args_backing.h" #include "local-execution/task_registry.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" +#include "local-execution/optimizer_tensor_source.h" namespace FlexFlow { @@ -15,8 +17,6 @@ using PerLayerElapsedTime = struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &); void register_and_allocate_layer(layer_guid_t const &); void allocate_layer_optimizer_tensors(layer_guid_t const &, @@ -25,17 +25,18 @@ struct LocalTrainingBacking { void execute_init(layer_guid_t const &); std::optional execute_forward(layer_guid_t const &); void compute_loss(LossAttrs const &loss_attrs, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor); + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); std::optional execute_backward(layer_guid_t const &); void execute_update(layer_guid_t const &, OptimizerAttrs const &); TaskArgumentAccessor - get_task_arg_accessor(TaskInvocation const &, - std::optional const &) const; - TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, - layer_guid_t const &) const; - LocalSlotsBacking local_slots_backing; + get_task_arg_accessor(TaskInvocation const &) const; + + TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const; + + LocalTensorBacking local_tensor_backing; + LocalArgsBacking local_args_backing; private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, @@ -46,6 +47,10 
@@ struct LocalTrainingBacking { Allocator allocator; ComputationGraph computation_graph; TaskRegistry task_registry; + + // optimizer + OptimizerTensorSource optimizer_tensor_source; + std::unordered_map> layer_optimizer_tensor_ids; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index 4ce74da766..f56f2b05b1 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -20,13 +20,15 @@ #include "local-execution/task_invocation.dtg.h" #include "local-execution/task_signature.h" #include "op-attrs/ops/loss_functions.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); TaskInvocation - backward(LossAttrs const &, reduced_tensor_t logit, reduced_tensor_t label); + backward(LossAttrs const &, tensor_guid_t logit, loss_tensor_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml b/lib/local-execution/include/local-execution/loss_tensor_t.struct.toml similarity index 82% rename from lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml rename to lib/local-execution/include/local-execution/loss_tensor_t.struct.toml index 726249c970..0d0d428a1b 100644 --- a/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml +++ b/lib/local-execution/include/local-execution/loss_tensor_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "reduced_tensor_t" +name = "loss_tensor_t" features = [ "eq", "ord", diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h new file mode 100644 index 0000000000..63cc2cd31e --- /dev/null +++ b/lib/local-execution/include/local-execution/lowered_tensor_source.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H + +#include "local-execution/lowered_tensor_t.dtg.h" + +namespace FlexFlow { + +struct LoweredTensorSource { +public: + LoweredTensorSource(); + + lowered_tensor_t new_lowered_tensor(); + +private: + static size_t next_available_lowered_tensor_id; +}; + +} // namespace FlexFlow + + +#endif diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml similarity index 62% rename from lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml rename to lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml index 4832ecaafa..287e548a5b 100644 --- a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml +++ b/lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml @@ -1,13 +1,13 @@ namespace = "FlexFlow" -name = "non_graph_tensor_guid_t" +name = "lowered_tensor_t" features = [ "eq", "ord", "hash", "fmt", - "json", ] + [[fields]] -name = "raw_uid" +name = "raw_index" type = "int" diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 5cc13f0b40..dd6a6f33d7 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ 
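LoweredTensorSource above (and the OptimizerTensorSource introduced alongside it in this commit) follow the same id-source pattern; a self-contained sketch of that pattern, with illustrative names:

// Sketch of the id-source pattern: a static counter hands out fresh,
// never-reused ids through a tiny factory type.
struct example_id_t {
  int raw_index;
};

struct ExampleIdSource {
  example_id_t new_id() {
    // Post-increment: hand out the current id, then bump the shared counter.
    return example_id_t{ExampleIdSource::next_available_id++};
  }

private:
  static int next_available_id;
};

int ExampleIdSource::next_available_id = 0;

Keeping the counter static makes the ids unique across every source instance in the process, so independently constructed sources never hand out colliding ids.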
b/lib/local-execution/include/local-execution/model_training_instance.h @@ -3,6 +3,8 @@ #include "local-execution/local_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { @@ -12,12 +14,10 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(Allocator const &, ComputationGraph const &, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &, LossAttrs const &, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, OptimizerAttrs const &); void execute_init(); @@ -28,8 +28,8 @@ struct ModelTrainingInstance { ComputationGraph computation_graph; LocalTrainingBacking training_backing; LossAttrs loss_attrs; - reduced_tensor_t logit_tensor; - reduced_tensor_t label_tensor; + tensor_guid_t logit_tensor; + loss_tensor_t label_tensor; OptimizerAttrs optimizer_attrs; }; diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h index 6484981ebf..0f351c3a0e 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -10,7 +10,7 @@ #include "local-execution/op_tensor_spec.h" #include "local-execution/profiling.h" #include "local-execution/runtime_arg_ref.h" -#include "local-execution/slot_tensor_type_id.dtg.h" +#include "local-execution/slot_grad_id.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/variadic_tensor_ref.h" #include @@ -84,14 +84,14 @@ struct OpTaskBinding { bool operator==(OpTaskBinding const &other) const; bool operator!=(OpTaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; void bind_from_forward(OpTaskBinding const &fwd); private: - std::unordered_map tensor_bindings; + std::unordered_map tensor_bindings; std::unordered_map arg_bindings; private: diff --git a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml index 54638a7eb6..590dbe6362 100644 --- a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml @@ -11,7 +11,7 @@ includes = [ "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/tensor_role.dtg.h", - "local-execution/tensor_type.dtg.h", + "local-execution/is_grad.dtg.h", "local-execution/op_slot_options.dtg.h", ] @@ -28,8 +28,8 @@ name = "tensor_role" type = "::FlexFlow::TensorRole" [[fields]] -name = "tensor_type" -type = "::FlexFlow::TensorType" +name = "is_grad" +type = "::FlexFlow::IsGrad" [[fields]] name = "slot_option" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 2eb480a0c1..f0dd610a1f 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include 
"local-execution/non_graph_tensor_guid_t.dtg.h" #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/task_signature.h" @@ -14,21 +13,21 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, - reduced_tensor_t const &weight, - std::vector const &grad_buffer_tensors); + tensor_guid_t const &weight, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, - reduced_tensor_t const &weight, - reduced_tensor_t const &sgd_v); + tensor_guid_t const &weight, + optimizer_tensor_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, - reduced_tensor_t const &weight, - reduced_tensor_t const &adam_v, - reduced_tensor_t const &adam_m); + tensor_guid_t const &weight, + optimizer_tensor_t const &adam_v, + optimizer_tensor_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h new file mode 100644 index 0000000000..fc5015b299 --- /dev/null +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H + +#include "local-execution/optimizer_tensor_t.dtg.h" + +namespace FlexFlow { + +struct OptimizerTensorSource { +public: + OptimizerTensorSource(); + + optimizer_tensor_t new_optimizer_tensor(); + +private: + static size_t next_available_optimizer_tensor_id; +}; + +} // namespace FlexFlow + + +#endif diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml b/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml new file mode 100644 index 0000000000..5d3e05f673 --- /dev/null +++ b/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "optimizer_tensor_t" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + + +[[fields]] +name = "raw_index" +type = "int" diff --git a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml b/lib/local-execution/include/local-execution/slot_grad_id.struct.toml new file mode 100644 index 0000000000..256091d272 --- /dev/null +++ b/lib/local-execution/include/local-execution/slot_grad_id.struct.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "SlotGradId" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "local-execution/is_grad.dtg.h", + "local-execution/slot_id_t.dtg.h", +] + +[[fields]] +name = "slot_id" +type = "::FlexFlow::slot_id_t" + +[[fields]] +name = "is_grad" +type = "::FlexFlow::IsGrad" diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index e211592ea6..33636616b3 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -1,20 +1,32 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H -#include "local-execution/reduced_tensor_t.dtg.h" +#include 
"local-execution/lowered_tensor_t.dtg.h" #include "local-execution/slot_id_t.dtg.h" #include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_arg_spec.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" +#include "local-execution/optimizer_tensor_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" +#include "local-execution/tensor_type_t.dtg.h" namespace FlexFlow { struct TaskBinding { TaskBinding() = default; - void bind(int, TensorType const &, reduced_tensor_t const &); - void bind(slot_id_t, TensorType const &, reduced_tensor_t const &); + void bind(int, tensor_guid_t const &); + void bind(slot_id_t, tensor_guid_t const &); + + void bind_grad(int, tensor_guid_t const &); + void bind_grad(slot_id_t, tensor_guid_t const &); + + void bind(int, optimizer_tensor_t const &); + void bind(slot_id_t, optimizer_tensor_t const &); + + void bind(int, loss_tensor_t const &); + void bind(slot_id_t, loss_tensor_t const &); template void bind_arg(int name, T const &t) { @@ -39,16 +51,16 @@ struct TaskBinding { bool operator==(TaskBinding const &other) const; bool operator!=(TaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; + void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); private: - std::unordered_map tensor_bindings; + std::unordered_map tensor_bindings; std::unordered_map arg_bindings; private: - void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); std::tuple tie() const; }; diff --git a/lib/local-execution/include/local-execution/tensor_reduction.h b/lib/local-execution/include/local-execution/tensor_lowering.h similarity index 67% rename from lib/local-execution/include/local-execution/tensor_reduction.h rename to lib/local-execution/include/local-execution/tensor_lowering.h index 2cb0b12ff0..5f3870c1d2 100644 --- a/lib/local-execution/include/local-execution/tensor_reduction.h +++ b/lib/local-execution/include/local-execution/tensor_lowering.h @@ -1,12 +1,12 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H #define _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H -#include "local-execution/reduced_tensor_t.dtg.h" +#include "local-execution/lowered_tensor_t.dtg.h" #include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { -reduced_tensor_t lower(tensor_guid_t const &); +lowered_tensor_t lower(tensor_guid_t const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/tensor_type.enum.toml b/lib/local-execution/include/local-execution/tensor_type.enum.toml index 31ce5ba83a..b1ae8fa667 100644 --- a/lib/local-execution/include/local-execution/tensor_type.enum.toml +++ b/lib/local-execution/include/local-execution/tensor_type.enum.toml @@ -8,7 +8,7 @@ features = [ ] [[values]] -name = "NON_GRAPH" +name = "LOSS" [[values]] name = "FORWARD" diff --git a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml new file mode 100644 index 0000000000..d4e525c348 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "TensorTypeVariant" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "pcg/tensor_guid_t.dtg.h", + "local-execution/optimizer_tensor_t.dtg.h", + "local-execution/loss_tensor_t.dtg.h" +] + +[[values]] +type = "::FlexFlow::tensor_guid_t" +key = 
"tensor_guid" + +[[values]] +type = "::FlexFlow::optimizer_tensor_t" +key = "optimizer_tensor" + +[[values]] +type = "::FlexFlow::loss_tensor_t" +key = "loss_tensor" diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc new file mode 100644 index 0000000000..0c3cfe70e8 --- /dev/null +++ b/lib/local-execution/src/local_args_backing.cc @@ -0,0 +1,62 @@ +#include "local-execution/local_args_backing.h" +#include "utils/containers/map_values.h" +#include "utils/containers/contains_key.h" +#include "utils/overload.h" +#include "op-attrs/parallel_tensor_shape.h" + +namespace FlexFlow { + + +void LocalArgsBacking::add_per_device_op_state( + layer_guid_t const &op_guid, + DeviceSpecificDeviceStates const &device_state) { + this->per_device_op_states.insert({op_guid, device_state}); +} + +ArgSlotsBacking LocalArgsBacking::construct_arg_slots_backing( + TaskBinding const &binding) const { + return map_values( + binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { + return arg_binding.template visit( + overload{[&](RuntimeArgRefSpec const &s) { + return this->lower_to_concrete_arg_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; }}); + }); + ; +} + +ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( + OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const & cg, layer_guid_t const &op_guid) const { + if (op_arg_ref_spec.holds()) { + assert(contains_key(this->per_device_op_states, op_guid)); + DeviceSpecificDeviceStates device_specific = + per_device_op_states.at(op_guid); + PerDeviceOpState device_state = + get_device_state_from_device_specific(device_specific, 0); + return ConcreteArgSpec::create(device_state); + } else if (op_arg_ref_spec.holds()) { + ParallelTensorShapeRefType index_op_arg_ref = + op_arg_ref_spec.get_ref_type().get(); + tensor_guid_t input_tensor = get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); + TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); + ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); + return ConcreteArgSpec::create(shape); + } else { + throw mk_runtime_error("Unhandled op arg ref type"); + } +} + +ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( + RuntimeArgRefSpec const &runtime_arg_ref_spec) const { + if (runtime_arg_ref_spec.holds>()) { + return ConcreteArgSpec::create( + *(this->runtime_arg_config.ff_handle.get(0))); + } else if (runtime_arg_ref_spec.holds()) { + return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); + } else { + throw mk_runtime_error("Unhandled runtime arg ref type"); + } +} + +} diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 02265281b0..404064b7ce 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,5 +1,5 @@ #include "local-execution/local_cost_estimator.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/tensor_lowering.h" #include "kernels/device.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/tracked_allocator.h" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc deleted file mode 100644 index 8a277adc78..0000000000 --- a/lib/local-execution/src/local_slots_backing.cc +++ /dev/null @@ -1,270 +0,0 @@ -#include "local-execution/local_slots_backing.h" -#include "local-execution/tensor_reduction.h" -#include "op-attrs/parallel_tensor_shape.h" 
-#include "pcg/computation_graph.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/map_values.h" -#include "utils/overload.h" - -namespace FlexFlow { - -LocalSlotsBacking::LocalSlotsBacking( - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, - RuntimeArgConfig const &runtime_arg_config) - : tensor_mapping(allocated_forward_tensors), - non_graph_tensor_mapping(allocated_non_graph_tensors), - runtime_arg_config(runtime_arg_config){}; - -void LocalSlotsBacking::add_per_device_op_state( - layer_guid_t const &op_guid, - DeviceSpecificDeviceStates const &device_state) { - this->per_device_op_states.insert({op_guid, device_state}); -} - -void LocalSlotsBacking::allocate_layer_tensors( - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - this->allocate_tensors_by_role( - TensorRole::INPUT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::WEIGHT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::OUTPUT, layer_guid, computation_graph, allocator); -} - -void LocalSlotsBacking::allocate_tensors_by_role( - TensorRole const &role, - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - std::vector tensors; - switch (role) { - case TensorRole::INPUT: - tensors = get_incoming_inputs(computation_graph, layer_guid); - this->input_tensor_slots.insert({layer_guid, - transform(tensors, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }) - }); - break; - case TensorRole::WEIGHT: - tensors = get_incoming_weights(computation_graph, layer_guid); - this->weight_tensor_slots.insert({layer_guid, - transform(tensors, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }) - }); - break; - case TensorRole::OUTPUT: - tensors = get_outgoing_tensors(computation_graph, layer_guid); - this->output_tensor_slots.insert({layer_guid, - transform(tensors, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }) - }); - break; - default: - throw mk_runtime_error("Invalid tensor role, got {}", role); - } - - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); - reduced_tensor_t reduced_tensor = lower(tensor); - LayerTensorKey layer_tensor_key = - LayerTensorKey{layer_guid, reduced_tensor}; - // tensor allocation - if (!is_forward_tensor_allocated(layer_tensor_key)) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_mapping.insert({layer_tensor_key, tensor_backing}); - } - - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES) { - GenericTensorAccessorW gradient_tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->gradient_tensor_mapping.insert( - {layer_tensor_key, gradient_tensor_backing}); - } - } -} - -void LocalSlotsBacking::allocate_optimizer_tensors( - layer_guid_t const &weight_layer, - tensor_guid_t const &weight, - ComputationGraph const &cg, - Allocator &allocator, - TaskSignature const &sig) { - GenericTensorAccessorW weight_backing = this->get_tensor_backing( - TensorType::FORWARD, lower(weight), weight_layer); - int num_grad_buffer_tensors = - sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) - std::vector optimizer_buffer_tensors; - for (int i = 0; i < num_grad_buffer_tensors; ++i) { - 
reduced_tensor_t buffer_tensor = reduced_tensor_t{i}; - GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( - get_tensor_shape(weight_backing.shape, weight_backing.data_type)); - this->optimizer_tensor_mapping.insert( - {LayerTensorKey{weight_layer, buffer_tensor}, buffer_backing}); - optimizer_buffer_tensors.push_back(buffer_tensor); - } - this->weight_optimizer_tensor_guids.insert( - {weight_layer, optimizer_buffer_tensors}); -} - -bool LocalSlotsBacking::is_forward_tensor_allocated( - LayerTensorKey const &layer_tensor_id) const { - return contains_key(this->tensor_mapping, layer_tensor_id); -} - -bool LocalSlotsBacking::is_non_graph_tensor_allocated( - reduced_tensor_t const &tensor_id) const { - return contains_key(this->non_graph_tensor_mapping, tensor_id); -} - -GenericTensorAccessorW const &LocalSlotsBacking::get_tensor_backing( - TensorType const &tensor_type, - reduced_tensor_t const &tensor_id, - std::optional const &layer_guid) const { - switch (tensor_type) { - case TensorType::FORWARD: - return this->tensor_mapping.at( - LayerTensorKey{layer_guid.value(), tensor_id}); - case TensorType::NON_GRAPH: - return this->non_graph_tensor_mapping.at(tensor_id); - case TensorType::GRADIENT: - return this->gradient_tensor_mapping.at( - LayerTensorKey{layer_guid.value(), tensor_id}); - case TensorType::OPTIMIZER: - return this->optimizer_tensor_mapping.at( - LayerTensorKey{layer_guid.value(), tensor_id}); - default: - throw mk_runtime_error( - fmt::format("Invalid tensor type {}", tensor_type)); - } -} - -TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( - OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - TensorSlotsBacking mapping; - - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotTensorTypeId slot_grad_id = tensor_binding.first; - OpTensorSpec tensor_spec = tensor_binding.second; - std::vector tensor_guids; - int weight_adjusted_idx = 0; - switch (tensor_spec.role) { - case TensorRole::WEIGHT: - assert(contains_key(this->weight_tensor_slots, op_guid)); - tensor_guids = this->weight_tensor_slots.at(op_guid); - break; - case TensorRole::INPUT: - assert(contains_key(this->input_tensor_slots, op_guid)); - tensor_guids = this->input_tensor_slots.at(op_guid); - break; - case TensorRole::OUTPUT: - assert(contains_key(this->output_tensor_slots, op_guid)); - tensor_guids = this->output_tensor_slots.at(op_guid); - break; - default: - throw mk_runtime_error( - fmt::format("Invalid TensorRole {}", tensor_spec.role)); - } - - mapping.insert({slot_grad_id, - this->get_tensor_backing(slot_grad_id.tensor_type, - tensor_guids.at(tensor_spec.idx), - op_guid)}); - } - return mapping; -} - -TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( - TaskBinding const &binding, - std::optional const &layer_guid) const { - TensorSlotsBacking mapping; - - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - reduced_tensor_t tensor_id = tensor_binding.second; - SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - GenericTensorAccessorW accessor = this->get_tensor_backing( - slot_tensor_type_id.tensor_type, tensor_id, layer_guid); - mapping.insert({slot_tensor_type_id, accessor}); - } - - return mapping; -} - -ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( - OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - return map_values( - binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](OpArgRefSpec 
const &s) { - return this->resolve_op_arg_ref_spec(s, op_guid); - }, - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); -} - -ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( - TaskBinding const &binding) const { - return map_values( - binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; -} - -ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( - OpArgRefSpec const &op_arg_ref_spec, layer_guid_t const &op_guid) const { - if (op_arg_ref_spec.holds()) { - assert(contains_key(per_device_op_states, op_guid)); - DeviceSpecificDeviceStates device_specific = - per_device_op_states.at(op_guid); - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_specific, 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - - assert(contains_key(this->input_tensor_slots, op_guid)); - std::vector input_tensor_guids = - this->input_tensor_slots.at(op_guid); - - assert(input_tensor_guids.size() > index_op_arg_ref.idx); - GenericTensorAccessorW tensor_backing = - this->get_tensor_backing(TensorType::FORWARD, - input_tensor_guids.at(index_op_arg_ref.idx), - op_guid); - ParallelTensorShape shape = lift_to_parallel( - get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); - return ConcreteArgSpec::create(shape); - } else { - throw mk_runtime_error("Unhandled op arg ref type"); - } -} - -ConcreteArgSpec LocalSlotsBacking::resolve_runtime_arg_ref_spec( - RuntimeArgRefSpec const &runtime_arg_ref_spec) const { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create( - *(this->runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); - } -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc new file mode 100644 index 0000000000..9da74c27b9 --- /dev/null +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -0,0 +1,123 @@ +#include "local-execution/local_tensor_backing.h" +#include "local-execution/tensor_lowering.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" +#include "utils/containers/contains_key.h" +#include "utils/overload.h" +#include "local-execution/slot_grad_id.dtg.h" + +namespace FlexFlow { + +LocalTensorBacking::LocalTensorBacking() {}; + +void LocalTensorBacking::allocate_layer_tensors( + layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, + Allocator &allocator) { + this->allocate_tensors_by_role( + TensorRole::INPUT, layer_guid, computation_graph, allocator); + this->allocate_tensors_by_role( + TensorRole::WEIGHT, layer_guid, computation_graph, allocator); + this->allocate_tensors_by_role( + TensorRole::OUTPUT, layer_guid, computation_graph, allocator); +} + +void LocalTensorBacking::allocate_tensors_by_role( + TensorRole const &role, + layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, + Allocator &allocator) { + std::vector tensors; + switch (role) { + case 
TensorRole::INPUT: + tensors = get_incoming_inputs(computation_graph, layer_guid); + break; + case TensorRole::WEIGHT: + tensors = get_incoming_weights(computation_graph, layer_guid); + break; + case TensorRole::OUTPUT: + tensors = get_outgoing_tensors(computation_graph, layer_guid); + break; + default: + throw mk_runtime_error("Invalid tensor role, got {}", role); + } + + for (tensor_guid_t const &tensor : tensors) { + TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); + // tensor allocation + if (!contains_key(this->tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); + this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); + GenericTensorAccessorW tensor_backing = + allocator.allocate_tensor(tensor_attrs.shape); + this->tensor_backings.insert({reduced_tensor, tensor_backing}); + } + + // gradient tensor allocation + if (tensor_attrs.create_gradients == CreateGrad::YES && !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); + this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); + GenericTensorAccessorW gradient_tensor_backing = + allocator.allocate_tensor(tensor_attrs.shape); + this->tensor_backings.insert( + {reduced_tensor, gradient_tensor_backing}); + } + } +} + +void LocalTensorBacking::allocate_optimizer_tensors( + tensor_guid_t const &weight, + std::vector<optimizer_tensor_t> const& optimizer_tensors, + Allocator &allocator) { + GenericTensorAccessorW weight_backing = this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); + for (optimizer_tensor_t const & optimizer_tensor: optimizer_tensors) { + // optimizer tensor allocation + if (!contains_key(this->optimizer_tensor_lowering_mapping, optimizer_tensor)) { + lowered_tensor_t buffer_tensor = this->lowered_tensor_source.new_lowered_tensor(); + this->optimizer_tensor_lowering_mapping.insert({optimizer_tensor, buffer_tensor}); + GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( + get_tensor_shape(weight_backing.shape, weight_backing.data_type)); + this->tensor_backings.insert({buffer_tensor, buffer_backing}); + } + } +} + +bool LocalTensorBacking::is_tensor_allocated(lowered_tensor_t const & tensor_id) const { + return contains_key(tensor_backings, tensor_id); +} + +GenericTensorAccessorW const &LocalTensorBacking::get_tensor_backing( + lowered_tensor_t const &tensor_id) const { + return this->tensor_backings.at(tensor_id); +} + +TensorSlotsBacking LocalTensorBacking::construct_tensor_slots_backing( + TaskBinding const &binding) const { + TensorSlotsBacking mapping; + + for (auto const &tensor_binding : binding.get_tensor_bindings()) { + SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; + + lowered_tensor_t tensor_id = [&] { + TensorTypeVariant tensor_type = tensor_binding.second; + if (tensor_type.has<tensor_guid_t>() and slot_tensor_type_id.tensor_type == TensorType::FORWARD) { + return this->tensor_lowering_mapping.at(tensor_type.get<tensor_guid_t>()); + } else if (tensor_type.has<tensor_guid_t>() and slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { + return this->gradient_tensor_lowering_mapping.at(tensor_type.get<tensor_guid_t>()); + } else if (tensor_type.has<optimizer_tensor_t>()) { + return this->optimizer_tensor_lowering_mapping.at(tensor_type.get<optimizer_tensor_t>()); + } else if (tensor_type.has<loss_tensor_t>()) { + return this->loss_tensor_lowering_mapping.at(tensor_type.get<loss_tensor_t>()); + } else { + throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); + } + }(); + + 
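// Note on the immediately-invoked lambda above: it dispatches on which
// alternative the bound TensorTypeVariant holds and, for graph tensors, on
// whether the slot asked for the FORWARD or GRADIENT copy, selecting the
// matching guid -> lowered_tensor_t map; the .at() lookups therefore throw
// if the tensor was never allocated through this backing.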
GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); + mapping.insert({slot_tensor_type_id, accessor}); + } + + return mapping; +} + +} // namespace FlexFlow
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index f02a8c7824..9b933dee9c 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -3,12 +3,13 @@ #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/tensor_lowering.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" +#include "utils/containers/values.h" #include "utils/exception.h" namespace FlexFlow { @@ -16,20 +17,16 @@ LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &runtime_arg_config) : allocator(allocator), computation_graph(computation_graph), - local_slots_backing(allocated_forward_tensors, - allocated_non_graph_tensors, - runtime_arg_config), - task_registry(empty_task_registry()) {} + local_args_backing(runtime_arg_config), + task_registry(empty_task_registry()) {}; void LocalTrainingBacking::register_and_allocate_layer( layer_guid_t const &node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, node).attrs; - this->local_slots_backing.allocate_layer_tensors( + this->local_tensor_backing.allocate_layer_tensors( node, this->computation_graph, this->allocator); register_tasks_for_layer(this->task_registry, node, attrs); } @@ -42,8 +39,14 @@ void LocalTrainingBacking::allocate_layer_optimizer_tensors( TaskSignature sig = get_update_signature(optimizer_attrs); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - this->local_slots_backing.allocate_optimizer_tensors( - node, weight_tensor, this->computation_graph, this->allocator, sig); + + std::vector<optimizer_tensor_t> optimizer_tensors; + for (TensorTypeSlotSpec const & tensor_type_slot_spec: values(sig.tensor_guid_slots)) { + optimizer_tensors.push_back(this->optimizer_tensor_source.new_optimizer_tensor()); + } + this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); + this->local_tensor_backing.allocate_optimizer_tensors( + weight_tensor, optimizer_tensors, this->allocator); } } @@ -73,12 +76,12 @@ void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - OpTaskInvocation invocation = init(attrs); + TaskInvocation invocation = this->lower_to_task_invocation(init(attrs), operator_node); TaskArgumentAccessor accessor = - this->get_op_task_arg_accessor(invocation, operator_node); + this->get_task_arg_accessor(invocation); DeviceSpecificDeviceStates device_state = this->call_init_task_impl(invocation.task_id, accessor); - this->local_slots_backing.add_per_device_op_state(operator_node, + this->local_args_backing.add_per_device_op_state(operator_node, device_state); } } @@ -90,9 +93,9 @@ std::optional<float> ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - OpTaskInvocation invocation = forward(attrs); + TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs), operator_node); TaskArgumentAccessor accessor = - this->get_op_task_arg_accessor(invocation, operator_node); + this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); } else { return std::nullopt; @@ -100,15 +103,14 @@ } void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor) { - assert(this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor) { TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation, std::nullopt); + this->get_task_arg_accessor(loss_invocation); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor); } @@ -120,9 +122,9 @@ std::optional<float> ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - OpTaskInvocation invocation = backward(attrs); + TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs), operator_node); TaskArgumentAccessor accessor = - this->get_op_task_arg_accessor(invocation, operator_node); + this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); } else { return std::nullopt; @@ -134,10 +136,8 @@ void LocalTrainingBacking::execute_update( LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); if (layer_attrs.attrs.has<WeightAttrs>()) { // get tensors - reduced_tensor_t weight_tensor = - lower(get_only(get_outgoing_tensors(this->computation_graph, node))); - std::vector<reduced_tensor_t> optimizer_buffer_tensors = - this->local_slots_backing.weight_optimizer_tensor_guids.at(node); + tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); + std::vector<optimizer_tensor_t> optimizer_buffer_tensors = this->layer_optimizer_tensor_ids.at(node); // get invocation TaskInvocation invocation = get_update_invocation( @@ -148,35 +148,62 @@ void LocalTrainingBacking::execute_update( // execute update TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation, node); + this->get_task_arg_accessor(invocation); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor); } } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation, - std::optional<layer_guid_t> const &layer_guid) const { + TaskInvocation const &invocation) const { TensorSlotsBacking tensor_slots_backing = - this->local_slots_backing.construct_tensor_slots_backing( - invocation.binding, layer_guid); + this->local_tensor_backing.construct_tensor_slots_backing( + invocation.binding); ArgSlotsBacking arg_slots_backing = - this->local_slots_backing.construct_arg_slots_backing(invocation.binding); + this->local_args_backing.construct_arg_slots_backing(invocation.binding); return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>( this->allocator, tensor_slots_backing, arg_slots_backing); } -TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor( - OpTaskInvocation const &invocation, layer_guid_t const &op_guid) const { - TensorSlotsBacking tensor_slots_backing = - this->local_slots_backing.construct_tensor_slots_backing( - invocation.binding, op_guid); - ArgSlotsBacking arg_slots_backing = - this->local_slots_backing.construct_arg_slots_backing(invocation.binding, - op_guid); +TaskInvocation LocalTrainingBacking::lower_to_task_invocation(OpTaskInvocation const & op_task_invocation, layer_guid_t const & layer_guid) const { + TaskBinding binding; + // tensors + for (auto const & tensor_binding: op_task_invocation.binding.get_tensor_bindings()) { + tensor_guid_t tensor_to_bind = [&] { + switch (tensor_binding.second.role) { + case TensorRole::INPUT: + return get_incoming_inputs(this->computation_graph, layer_guid).at(tensor_binding.second.idx); + case TensorRole::OUTPUT: + return get_outgoing_tensors(this->computation_graph, layer_guid).at(tensor_binding.second.idx); + case TensorRole::WEIGHT: + return get_incoming_weights(this->computation_graph, layer_guid).at(tensor_binding.second.idx); + default: + throw mk_runtime_error(fmt::format("Invalid tensor role {}", tensor_binding.second.role)); + } + }(); + + if (tensor_binding.first.is_grad == IsGrad::NO) { + binding.bind(tensor_binding.first.slot_id, tensor_to_bind); + } else if (tensor_binding.first.is_grad == IsGrad::YES) { + binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind); + } else { + throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad)); + } + } - return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>( - this->allocator, tensor_slots_backing, arg_slots_backing); + // args + for (auto const & arg_binding: op_task_invocation.binding.get_arg_bindings()) { + if (arg_binding.second.has<OpArgRefSpec>()) { + ConcreteArgSpec concrete_arg = this->local_args_backing.lower_to_concrete_arg_spec(arg_binding.second.get<OpArgRefSpec>(), this->computation_graph, layer_guid); + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); + } else if (arg_binding.second.has<RuntimeArgRefSpec>()) { + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()}); + } else { + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()}); + } + } + + return TaskInvocation{op_task_invocation.task_id, binding}; } } // namespace FlexFlow
diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index e54841acb5..bfb3c0a32b 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -24,8 +24,8 @@ enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, LOGIT, TensorType::NON_GRAPH); - add_slot(sig, LABEL, TensorType::NON_GRAPH); + add_slot(sig, LOGIT, TensorType::FORWARD); + add_slot(sig, LABEL, TensorType::LOSS); add_slot(sig, LOGIT, TensorType::GRADIENT); add_arg_slot<LossAttrs>(sig, ATTRS); @@ -34,12 +34,12 @@ } TaskInvocation backward(LossAttrs const &attrs, - reduced_tensor_t logit, - reduced_tensor_t label) { + tensor_guid_t logit, + loss_tensor_t label) { TaskBinding b; - b.bind(LOGIT, TensorType::NON_GRAPH, logit); - b.bind(LABEL, TensorType::NON_GRAPH, label); - b.bind(LOGIT, TensorType::GRADIENT, logit); + b.bind(LOGIT, logit); + b.bind(LABEL, label); + b.bind_grad(LOGIT, logit); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings());
diff --git a/lib/local-execution/src/lowered_tensor_source.cc b/lib/local-execution/src/lowered_tensor_source.cc new file mode 100644 index 0000000000..05960ff5e2 --- /dev/null +++ 
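To make the tensor half of the lowering above concrete: an op task that bound slot INPUT at input index 0, both forward and grad, ends up with a TaskBinding equivalent to this sketch (the slot enum is illustrative only):

// Sketch: the lowered equivalent of binding {INPUT, IsGrad::NO} and
// {INPUT, IsGrad::YES} at input idx 0, where t is the layer's first input.
enum ExampleSlots { INPUT };

TaskBinding lowered_equivalent(tensor_guid_t const &t) {
  TaskBinding b;
  b.bind(INPUT, t);      // from {INPUT, IsGrad::NO}  -> {INPUT, TensorType::FORWARD}
  b.bind_grad(INPUT, t); // from {INPUT, IsGrad::YES} -> {INPUT, TensorType::GRADIENT}
  return b;
}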
b/lib/local-execution/src/lowered_tensor_source.cc @@ -0,0 +1,13 @@ +#include "local-execution/lowered_tensor_source.h" + +namespace FlexFlow { + +size_t LoweredTensorSource::next_available_lowered_tensor_id = 0; + +LoweredTensorSource::LoweredTensorSource() {} + +lowered_tensor_t LoweredTensorSource::new_lowered_tensor() { + return lowered_tensor_t{LoweredTensorSource::next_available_lowered_tensor_id++}; +} + +} diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 4815de5e85..f57c5db73a 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -8,18 +8,14 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( Allocator const &allocator, ComputationGraph const &computation_graph, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &runtime_arg_config, LossAttrs const &loss_attrs, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, OptimizerAttrs const &optimizer_attrs) : computation_graph(computation_graph), training_backing(allocator, computation_graph, - allocated_forward_tensors, - allocated_non_graph_tensors, runtime_arg_config), loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 81bf185911..b6771e6eb8 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -21,7 +21,7 @@ void OpTaskBinding::bind(int slot, OpTensorSpec const &tensor_spec) { void OpTaskBinding::bind(slot_id_t slot, OpTensorSpec const &tensor_spec) { this->tensor_bindings.insert( - {SlotTensorTypeId{slot, TensorType::FORWARD}, tensor_spec}); + {SlotGradId{slot, IsGrad::NO}, tensor_spec}); } void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { @@ -30,7 +30,7 @@ void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { void OpTaskBinding::bind_grad(slot_id_t slot, OpTensorSpec const &tensor_spec) { this->tensor_bindings.insert( - {SlotTensorTypeId{slot, TensorType::GRADIENT}, tensor_spec}); + {SlotGradId{slot, IsGrad::YES}, tensor_spec}); } void OpTaskBinding::insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec) { @@ -46,13 +46,13 @@ bool OpTaskBinding::operator!=(OpTaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> OpTaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & OpTaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } @@ -91,8 +91,8 @@ bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotTensorTypeId tensor_key = SlotTensorTypeId{ - op_tensor_slot_spec.name, op_tensor_slot_spec.tensor_type}; + SlotGradId tensor_key = SlotGradId{ + op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; diff 
--git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 5c8b19265a..69b5463a0d 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -16,7 +16,7 @@ void OpTaskSignature::add_input_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -31,7 +31,7 @@ void OpTaskSignature::add_optional_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -46,7 +46,7 @@ void OpTaskSignature::add_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -62,7 +62,7 @@ void OpTaskSignature::add_optional_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL_UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -76,7 +76,7 @@ void OpTaskSignature::add_output_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::OUTPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -92,7 +92,7 @@ void OpTaskSignature::add_bwd_optional_output_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::OUTPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -106,7 +106,7 @@ void OpTaskSignature::add_weight_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::WEIGHT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -121,7 +121,7 @@ void OpTaskSignature::add_optional_weight_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::WEIGHT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -146,7 +146,7 @@ OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd) { OpTensorSlotSpec{op_tensor_slot_spec.name, op_tensor_slot_spec.slot_type, op_tensor_slot_spec.tensor_role, - TensorType::GRADIENT, + IsGrad::YES, op_tensor_slot_spec.slot_option}; bwd.op_tensor_slots.insert(grad_spec); } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 5c0d6c54f2..94584dfc95 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -22,14 +22,14 @@ TaskSignature get_sgd_update_signature() { } TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, - reduced_tensor_t const &weight, - reduced_tensor_t const &sgd_v) { + tensor_guid_t const &weight, + optimizer_tensor_t const &sgd_v) { TaskBinding b; - b.bind(WEIGHT, TensorType::FORWARD, weight); - b.bind(WEIGHT, TensorType::GRADIENT, weight); + b.bind(WEIGHT, weight); + b.bind_grad(WEIGHT, weight); if (attrs.momentum > 0.0f) { - b.bind(SGD_V, TensorType::OPTIMIZER, sgd_v); + b.bind(SGD_V, sgd_v); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -113,14 +113,14 @@ TaskSignature get_adam_update_signature() { } TaskInvocation adam_update(AdamOptimizerAttrs 
const &attrs,
-                           reduced_tensor_t const &weight,
-                           reduced_tensor_t const &adam_v,
-                           reduced_tensor_t const &adam_m) {
+                           tensor_guid_t const &weight,
+                           optimizer_tensor_t const &adam_v,
+                           optimizer_tensor_t const &adam_m) {
   TaskBinding b;
-  b.bind(WEIGHT, TensorType::FORWARD, weight);
-  b.bind(WEIGHT, TensorType::GRADIENT, weight);
-  b.bind(ADAM_M, TensorType::OPTIMIZER, adam_m);
-  b.bind(ADAM_V, TensorType::OPTIMIZER, adam_v);
+  b.bind(WEIGHT, weight);
+  b.bind_grad(WEIGHT, weight);
+  b.bind(ADAM_M, adam_m);
+  b.bind(ADAM_V, adam_v);
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
 
@@ -194,8 +194,8 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
 
 TaskInvocation get_update_invocation(
     OptimizerAttrs const &attrs,
-    reduced_tensor_t const &weight,
-    std::vector<reduced_tensor_t> const &grad_buffer_tensors) {
+    tensor_guid_t const &weight,
+    std::vector<optimizer_tensor_t> const &grad_buffer_tensors) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &s) {
         return sgd_update(s, weight, grad_buffer_tensors.at(0));
diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/local-execution/src/optimizer_tensor_source.cc
new file mode 100644
index 0000000000..8adb8ec07b
--- /dev/null
+++ b/lib/local-execution/src/optimizer_tensor_source.cc
@@ -0,0 +1,13 @@
+#include "local-execution/optimizer_tensor_source.h"
+
+namespace FlexFlow {
+
+size_t OptimizerTensorSource::next_available_optimizer_tensor_id = 0;
+
+OptimizerTensorSource::OptimizerTensorSource() {}
+
+optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() {
+  return optimizer_tensor_t{
+      OptimizerTensorSource::next_available_optimizer_tensor_id++};
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
index 2b1256df90..6fc8449f0b 100644
--- a/lib/local-execution/src/task_binding.cc
+++ b/lib/local-execution/src/task_binding.cc
@@ -2,19 +2,48 @@
 #include "utils/containers/contains_key.h"
 #include "utils/fmt/unordered_map.h"
 #include "utils/hash/unordered_map.h"
+#include "pcg/tensor_guid_t.dtg.h"
 
 namespace FlexFlow {
 
 void TaskBinding::bind(int name,
-                       TensorType const &tensor_type,
-                       reduced_tensor_t const &binding) {
-  this->bind(slot_id_t{name}, tensor_type, binding);
+                       tensor_guid_t const &binding) {
+  this->bind(slot_id_t{name}, binding);
 }
 
 void TaskBinding::bind(slot_id_t name,
-                       TensorType const &tensor_type,
-                       reduced_tensor_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, tensor_type}, binding});
+                       tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {SlotTensorTypeId{name, TensorType::FORWARD}, TensorTypeVariant{binding}});
+}
+
+void TaskBinding::bind_grad(int name,
+                            tensor_guid_t const &binding) {
+  this->bind_grad(slot_id_t{name}, binding);
+}
+
+void TaskBinding::bind_grad(slot_id_t name,
+                            tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {SlotTensorTypeId{name, TensorType::GRADIENT}, TensorTypeVariant{binding}});
+}
+
+void TaskBinding::bind(int name,
+                       optimizer_tensor_t const &binding) {
+  this->bind(slot_id_t{name}, binding);
+}
+
+void TaskBinding::bind(slot_id_t name,
+                       optimizer_tensor_t const &binding) {
+  this->tensor_bindings.insert(
+      {SlotTensorTypeId{name, TensorType::OPTIMIZER}, TensorTypeVariant{binding}});
+}
+
+void TaskBinding::bind(int name,
+                       loss_tensor_t const &binding) {
+  this->bind(slot_id_t{name}, binding);
+}
+
+void TaskBinding::bind(slot_id_t name,
+                       loss_tensor_t const &binding) {
+  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::LOSS},
TensorTypeVariant{binding}}); } void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) { @@ -30,13 +59,13 @@ bool TaskBinding::operator!=(TaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> TaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & TaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } diff --git a/lib/local-execution/src/tensor_lowering.cc b/lib/local-execution/src/tensor_lowering.cc new file mode 100644 index 0000000000..63be366d94 --- /dev/null +++ b/lib/local-execution/src/tensor_lowering.cc @@ -0,0 +1,10 @@ +#include "local-execution/tensor_lowering.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +lowered_tensor_t lower(tensor_guid_t const &tensor_guid) { + return lowered_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc deleted file mode 100644 index ae5b188dfd..0000000000 --- a/lib/local-execution/src/tensor_reduction.cc +++ /dev/null @@ -1,10 +0,0 @@ -#include "local-execution/tensor_reduction.h" -#include "utils/containers/transform.h" - -namespace FlexFlow { - -reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { - return reduced_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; -} - -} // namespace FlexFlow From 66d61eb11c1f6566be7b005261d0a7b29f0fc4dc Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 21 Jan 2025 02:49:27 -0800 Subject: [PATCH 26/91] feat: add realm-backend subdir --- .proj.toml | 1 + lib/realm-backend/CMakeLists.txt | 20 ++++++++++++++++++++ lib/realm-backend/test/CMakeLists.txt | 14 ++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 lib/realm-backend/CMakeLists.txt create mode 100644 lib/realm-backend/test/CMakeLists.txt diff --git a/.proj.toml b/.proj.toml index 10307a6efa..c895fcecc4 100644 --- a/.proj.toml +++ b/.proj.toml @@ -12,6 +12,7 @@ build_targets = [ "compiler", "substitution-generator", "local-execution", + "realm-backend", "models", "export-model-arch", "substitution-to-dot", diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt new file mode 100644 index 0000000000..623816567e --- /dev/null +++ b/lib/realm-backend/CMakeLists.txt @@ -0,0 +1,20 @@ +ff_add_library( + NAME + realm-backend + SRC_PATTERNS + src/*.cc + PUBLIC_INCLUDE + include/ + PRIVATE_INCLUDE + src/ + DEPS + op-attrs + utils + kernels + local-execution + pcg + spdlog + legion +) + +add_subdirectory(test) diff --git a/lib/realm-backend/test/CMakeLists.txt b/lib/realm-backend/test/CMakeLists.txt new file mode 100644 index 0000000000..965f2e04b2 --- /dev/null +++ b/lib/realm-backend/test/CMakeLists.txt @@ -0,0 +1,14 @@ +ff_add_test_executable( + NAME + realm-backend-tests + SRC_PATTERNS + src/*.cc + PRIVATE_INCLUDE + src/ + DEPS + doctest + utils-test-common + realm-backend + kernels + op-attrs +) From 411017d25dd01e1222d8b9bbff5dd8a441ec745e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 Jan 2025 17:45:26 -0800 Subject: [PATCH 27/91] Build local exec --- .../local-execution/task_argument_accessor.h | 40 +++++++++---------- .../src/local_cost_estimator.cc | 2 - .../src/local_training_backing.cc | 6 +-- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h 
b/lib/local-execution/include/local-execution/task_argument_accessor.h index 29d5fb8fbe..8b8516045d 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -59,16 +59,16 @@ struct TaskArgumentAccessor { this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - template - privilege_mode_to_accessor get_non_graph_tensor(int slot) const { - return this->get_tensor_grad(slot_id_t{slot}); - } + // template + // privilege_mode_to_accessor get_non_graph_tensor(int slot) const { + // return this->get_tensor_grad(slot_id_t{slot}); + // } - template - privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); - } + // template + // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const { + // return std::get>( + // this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); + // } // variadic tensors template @@ -110,18 +110,18 @@ struct TaskArgumentAccessor { this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - template - std::vector> - get_variadic_non_graph_tensor(int slot) const { - return this->get_variadic_tensor_grad(slot_id_t{slot}); - } + // template + // std::vector> + // get_variadic_non_graph_tensor(int slot) const { + // return this->get_variadic_tensor_grad(slot_id_t{slot}); + // } - template - std::vector> - get_variadic_non_graph_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH)); - } + // template + // std::vector> + // get_variadic_non_graph_tensor(slot_id_t slot) const { + // return std::get>>( + // this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH)); + // } Allocator get_allocator() const { return this->ptr->get_allocator(); diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 404064b7ce..b416378e66 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -66,8 +66,6 @@ CostDetails LocalCostEstimator::estimate_cost( LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - LayerTensorBackingMap{}, - TensorBackingMap{}, this->runtime_arg_config); local_backing.register_and_allocate_layer(layer_added_result.layer); local_backing.execute_init(layer_added_result.layer); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 9b933dee9c..22dc3b8397 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -76,7 +76,7 @@ void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = this->lower_to_task_invocation(init(attrs)); + TaskInvocation invocation = this->lower_to_task_invocation(init(attrs), operator_node); TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); DeviceSpecificDeviceStates device_state = @@ -93,7 +93,7 @@ std::optional ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs)); + TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs), operator_node); TaskArgumentAccessor accessor = 
this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); @@ -122,7 +122,7 @@ std::optional ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs)); + TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs), operator_node); TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); From bcd1408de85562e3bc2f9aea1427c38f5c4eeffd Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 22 Jan 2025 01:25:52 -0800 Subject: [PATCH 28/91] chore: duplicate some files from local-execution --- .proj.toml | 1 + lib/CMakeLists.txt | 1 + lib/realm-backend/CMakeLists.txt | 2 +- .../realm-backend/realm_args_backing.h | 37 ++++ .../realm_task_argument_accessor.h | 55 +++++ .../realm-backend/realm_tensor_backing.h | 58 +++++ .../realm-backend/realm_training_backing.h | 58 +++++ .../src/realm_training_backing.cc | 209 ++++++++++++++++++ 8 files changed, 420 insertions(+), 1 deletion(-) create mode 100644 lib/realm-backend/include/realm-backend/realm_args_backing.h create mode 100644 lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h create mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.h create mode 100644 lib/realm-backend/include/realm-backend/realm_training_backing.h create mode 100644 lib/realm-backend/src/realm_training_backing.cc diff --git a/.proj.toml b/.proj.toml index c895fcecc4..c1612ce918 100644 --- a/.proj.toml +++ b/.proj.toml @@ -27,6 +27,7 @@ test_targets = [ "compiler-tests", "substitution-generator-tests", "local-execution-tests", + #"realm-backend-tests", "models-tests", ] diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 972c656126..136bb29528 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(runtime) add_subdirectory(op-attrs) add_subdirectory(kernels) add_subdirectory(local-execution) +add_subdirectory(realm-backend) add_subdirectory(utils) add_subdirectory(ffi) add_subdirectory(substitutions) diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt index 623816567e..436d8cc8b0 100644 --- a/lib/realm-backend/CMakeLists.txt +++ b/lib/realm-backend/CMakeLists.txt @@ -17,4 +17,4 @@ ff_add_library( legion ) -add_subdirectory(test) +# add_subdirectory(test) diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h new file mode 100644 index 0000000000..626698cba6 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_args_backing.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H +#define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H + +#include "pcg/layer_guid_t.dtg.h" +#include "pcg/computation_graph.h" +#include "local-execution/per_device_op_state.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.dtg.h" +#include "realm-backend/realm_task_argument_accessor.h" + +namespace FlexFlow { + +struct LocalArgsBacking { + LocalArgsBacking(RuntimeArgConfig const &); + +public: + void add_per_device_op_state(layer_guid_t const &, + DeviceSpecificDeviceStates const &); + + ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; + + ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; + ConcreteArgSpec 
lower_to_concrete_arg_spec(OpArgRefSpec const &, + ComputationGraph const &, + layer_guid_t const &) const; + +public: + // arguments + std::unordered_map + per_device_op_states; + RuntimeArgConfig runtime_arg_config; +}; + +} + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h new file mode 100644 index 0000000000..ca4bc9db02 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -0,0 +1,55 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H + +#include "local-execution/slot_tensor_type_id.dtg.h" +#include "local-execution/task_argument_accessor.h" +#include +#include + +namespace FlexFlow { + +using TensorSlotsBacking = std::unordered_map< + SlotTensorTypeId, + std::variant>>; +using ArgSlotsBacking = std::unordered_map; + +struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { + LocalTaskArgumentAccessor(Allocator const &allocator, + TensorSlotsBacking const &tensor_slots_backing, + ArgSlotsBacking const &arg_slots_backing); + + LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete; + LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete; + + ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; + + GenericTensorAccessor get_tensor(slot_id_t slot, + Permissions priv, + TensorType tensor_type) const override; + VariadicGenericTensorAccessor get_variadic_tensor( + slot_id_t slot, Permissions priv, TensorType tensor_type) const override; + + Allocator get_allocator() const override; + + size_t get_device_idx() const override; + +private: + Allocator allocator; + TensorSlotsBacking tensor_slots_backing; + ArgSlotsBacking arg_slots_backing; +}; + +using TensorSlotsBackingWithoutAddresses = std::unordered_map< + SlotTensorTypeId, + std::variant, + std::vector>>>; + +TensorSlotsBackingWithoutAddresses + get_slots_backing_without_tensor_allocation_addresses( + TensorSlotsBacking const &); + +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h new file mode 100644 index 0000000000..2d9fa0bbdf --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -0,0 +1,58 @@ + +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H +#define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H + +#include "kernels/accessor.h" +#include "realm-backend/realm_task_argument_accessor.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" +#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/lowered_tensor_source.h" +#include "local-execution/optimizer_tensor_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "pcg/layer_guid_t.dtg.h" + +namespace FlexFlow { + +using TensorBackingMap = + std::unordered_map; + +struct LocalTensorBacking { + LocalTensorBacking(); + +public: + void allocate_layer_tensors(layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_tensors_by_role(TensorRole const &, + layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_optimizer_tensors(tensor_guid_t const &, + std::vector const &, + Allocator 
&); + TensorSlotsBacking + construct_tensor_slots_backing(TaskBinding const &) const; + + GenericTensorAccessorW const & + get_tensor_backing(lowered_tensor_t const &) const; + + bool is_tensor_allocated(lowered_tensor_t const &) const; + +public: + // tensors + TensorBackingMap tensor_backings; + + std::unordered_map tensor_lowering_mapping; + std::unordered_map gradient_tensor_lowering_mapping; + std::unordered_map optimizer_tensor_lowering_mapping; + std::unordered_map loss_tensor_lowering_mapping; + + LoweredTensorSource lowered_tensor_source; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h new file mode 100644 index 0000000000..e5385a93c3 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -0,0 +1,58 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H +#define _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H + +#include "realm-backend/realm_tensor_backing.h" +#include "realm-backend/realm_args_backing.h" +#include "local-execution/task_registry.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "local-execution/optimizer_tensor_source.h" + +namespace FlexFlow { + +using PerLayerElapsedTime = + std::unordered_map>; + +struct LocalTrainingBacking { + LocalTrainingBacking(Allocator const &, + ComputationGraph const &, + RuntimeArgConfig const &); + void register_and_allocate_layer(layer_guid_t const &); + void allocate_layer_optimizer_tensors(layer_guid_t const &, + OptimizerAttrs const &); + + void execute_init(layer_guid_t const &); + std::optional execute_forward(layer_guid_t const &); + void compute_loss(LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); + std::optional execute_backward(layer_guid_t const &); + void execute_update(layer_guid_t const &, OptimizerAttrs const &); + + TaskArgumentAccessor + get_task_arg_accessor(TaskInvocation const &) const; + + TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const; + + LocalTensorBacking local_tensor_backing; + LocalArgsBacking local_args_backing; + +private: + DeviceSpecificDeviceStates call_init_task_impl(task_id_t, + TaskArgumentAccessor const &); + std::optional call_task_impl(task_id_t, TaskArgumentAccessor); + +private: + Allocator allocator; + ComputationGraph computation_graph; + TaskRegistry task_registry; + + // optimizer + OptimizerTensorSource optimizer_tensor_source; + std::unordered_map> layer_optimizer_tensor_ids; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc new file mode 100644 index 0000000000..46efb17bc1 --- /dev/null +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -0,0 +1,209 @@ +#include "realm-backend/realm_training_backing.h" +#include "local-execution/loss_functions.h" +#include "local-execution/optimizer.h" +#include "local-execution/task_invocation.h" +#include "local-execution/task_signature_impl.h" +#include "local-execution/tensor_lowering.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/contains.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/get_only.h" +#include "utils/containers/values.h" +#include "utils/exception.h" + +namespace FlexFlow { + 
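+// A rough usage sketch of this backing (assumes pcg's topological_ordering;
+// allocator construction is left abstract):
+//
+//   LocalTrainingBacking backing{allocator, computation_graph,
+//                                runtime_arg_config};
+//   for (layer_guid_t const &layer : topological_ordering(computation_graph)) {
+//     backing.register_and_allocate_layer(layer);
+//     backing.execute_init(layer);
+//   }
+//   for (layer_guid_t const &layer : topological_ordering(computation_graph)) {
+//     backing.execute_forward(layer);
+//   }
+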
+LocalTrainingBacking::LocalTrainingBacking(
+    Allocator const &allocator,
+    ComputationGraph const &computation_graph,
+    RuntimeArgConfig const &runtime_arg_config)
+    : allocator(allocator), computation_graph(computation_graph),
+      local_args_backing(runtime_arg_config),
+      task_registry(empty_task_registry()) {}
+
+void LocalTrainingBacking::register_and_allocate_layer(
+    layer_guid_t const &node) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  this->local_tensor_backing.allocate_layer_tensors(
+      node, this->computation_graph, this->allocator);
+  register_tasks_for_layer(this->task_registry, node, attrs);
+}
+
+void LocalTrainingBacking::allocate_layer_optimizer_tensors(
+    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  if (attrs.has<WeightAttrs>()) {
+    TaskSignature sig = get_update_signature(optimizer_attrs);
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+
+    std::vector<optimizer_tensor_t> optimizer_tensors;
+    for (TensorTypeSlotSpec const &tensor_type_slot_spec :
+         values(sig.tensor_guid_slots)) {
+      optimizer_tensors.push_back(
+          this->optimizer_tensor_source.new_optimizer_tensor());
+    }
+    this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors});
+    this->local_tensor_backing.allocate_optimizer_tensors(
+        weight_tensor, optimizer_tensors, this->allocator);
+  }
+}
+
+DeviceSpecificDeviceStates
+    LocalTrainingBacking::call_init_task_impl(task_id_t task_id,
+                                              TaskArgumentAccessor const &acc) {
+  TaskSignatureAndImpl task_sig_impl =
+      this->task_registry.task_mapping.at(task_id);
+  auto fn =
+      task_sig_impl.impl_function.get<InitOpTaskImplFunction>().function_ptr;
+  return fn(acc);
+}
+
+std::optional<float>
+    LocalTrainingBacking::call_task_impl(task_id_t task_id,
+                                         TaskArgumentAccessor acc) {
+  TaskSignatureAndImpl task_sig_impl =
+      this->task_registry.task_mapping.at(task_id);
+  auto fn =
+      task_sig_impl.impl_function.get<FwdBwdOpTaskImplFunction>().function_ptr;
+  return fn(acc);
+}
+
+void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(
+          this->task_registry, operator_node, OpTaskType::INIT)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(init(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    DeviceSpecificDeviceStates device_state =
+        this->call_init_task_impl(invocation.task_id, accessor);
+    this->local_args_backing.add_per_device_op_state(operator_node,
+                                                     device_state);
+  }
+}
+
+std::optional<float>
+    LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(
+          this->task_registry, operator_node, OpTaskType::FWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(forward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
+  }
+}
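+
+// Loss is computed once at the end of the forward pass: compute_loss binds
+// the logit tensor (a forward output), its gradient, and the separately
+// allocated label tensor, then runs the loss backward kernel to seed the
+// gradients that execute_backward propagates.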
+
+void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
+                                        tensor_guid_t const &logit_tensor,
+                                        loss_tensor_t const &label_tensor) {
+  TaskInvocation loss_invocation =
+      backward(loss_attrs, logit_tensor, label_tensor);
+  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor =
+      this->get_task_arg_accessor(loss_invocation);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+}
+
+std::optional<float>
+    LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(
+          this->task_registry, operator_node, OpTaskType::BWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(backward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
+  }
+}
+
+void LocalTrainingBacking::execute_update(
+    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+  LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
+  if (layer_attrs.attrs.has<WeightAttrs>()) {
+    // get tensors
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    std::vector<optimizer_tensor_t> optimizer_buffer_tensors =
+        this->layer_optimizer_tensor_ids.at(node);
+
+    // get invocation
+    TaskInvocation invocation = get_update_invocation(
+        optimizer_attrs, weight_tensor, optimizer_buffer_tensors);
+
+    // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+    // assert(is_invocation_valid(get_update_signature(attrs), invocation));
+
+    // execute update
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
+    update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
+  }
+}
+
+TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
+    TaskInvocation const &invocation) const {
+  TensorSlotsBacking tensor_slots_backing =
+      this->local_tensor_backing.construct_tensor_slots_backing(
+          invocation.binding);
+  ArgSlotsBacking arg_slots_backing =
+      this->local_args_backing.construct_arg_slots_backing(invocation.binding);
+  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
+      this->allocator, tensor_slots_backing, arg_slots_backing);
+}
+
+TaskInvocation LocalTrainingBacking::lower_to_task_invocation(
+    OpTaskInvocation const &op_task_invocation,
+    layer_guid_t const &layer_guid) const {
+  TaskBinding binding;
+  // tensors
+  for (auto const &tensor_binding :
+       op_task_invocation.binding.get_tensor_bindings()) {
+    tensor_guid_t tensor_to_bind = [&] {
+      switch (tensor_binding.second.role) {
+        case TensorRole::INPUT:
+          return get_incoming_inputs(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::OUTPUT:
+          return get_outgoing_tensors(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::WEIGHT:
+          return get_incoming_weights(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        default:
+          throw mk_runtime_error(fmt::format("Invalid tensor role {}",
+                                             tensor_binding.second.role));
+      }
+    }();
+
+    if (tensor_binding.first.is_grad == IsGrad::NO) {
+      binding.bind(tensor_binding.first.slot_id, tensor_to_bind);
+    } else if (tensor_binding.first.is_grad == IsGrad::YES) {
+      binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind);
+    } else {
+      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}",
+                                         tensor_binding.first.is_grad));
+    }
+  }
+
+  // args
+  for (auto const &arg_binding :
+       op_task_invocation.binding.get_arg_bindings()) {
+    if (arg_binding.second.has<OpArgRefSpec>()) {
+      ConcreteArgSpec concrete_arg =
+          this->local_args_backing.lower_to_concrete_arg_spec(
+              arg_binding.second.get<OpArgRefSpec>(),
+              this->computation_graph,
+              layer_guid);
+      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg});
+    } else if (arg_binding.second.has<ConcreteArgSpec>()) {
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()});
+    } else {
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()});
+    }
+  }
+
+  return TaskInvocation{op_task_invocation.task_id, binding};
+}
+
+} // namespace FlexFlow
From 1c55cf7fa10de442ab1bbc0b6c11a04cebb76468 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Mon, 27 Jan 2025 22:31:38 -0800
Subject: Merge branch 'master' of github.com:flexflow/flexflow-train into
 realm-backend

From b9144ad0b3107d75011a602634b5cfa05fe58a69 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Thu, 30 Jan 2025 06:19:34 -0800
Subject: [PATCH 30/91] chore: update legion

---
 .flake/pkgs/legion.nix | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix
index 814ef85e00..ebef22f4c5 100644
--- a/.flake/pkgs/legion.nix
+++ b/.flake/pkgs/legion.nix
@@ -18,13 +18,13 @@ in
 
 stdenv.mkDerivation rec {
   pname = "legion_flexflow";
-  version = "2024-03-13";
+  version = "2025-01-21";
 
   src = fetchFromGitLab {
     owner = "StanfordLegion";
     repo = "legion";
-    rev = "24e8c452341dea41427e0ce61e154d61715e6835";
-    sha256 = "sha256-NjCSjphOIew/V24i74I6DModSGcWKLeiSIjts3cFtx4=";
+    rev = "0c5a181e59c07e3af1091a2007378ff9355047fa";
+    sha256 = "sha256-oapo7klN17gmRsmaSsrpup4YJ0dtHxiKFtwz8jyPqzU=";
     fetchSubmodules = true;
   };
 
@@ -33,7 +33,7 @@ stdenv.mkDerivation rec {
   ];
 
   cmakeFlags = [
-    "-DLegion_USE_Python=1"
+    "-DLegion_USE_Python=0"
     "-DLegion_BUILD_BINDINGS=1"
     "-DLegion_USE_CUDA=1"
     "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}"
From 66647a26bccaecaa1a57852971d0b7a5735d32d4 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Thu, 30 Jan 2025 07:33:27 -0800
Subject: [PATCH 31/91] feat: add legion related code

---
 .../include/realm-backend/driver.h            | 11 +++
 .../include/realm-backend/task_wrapper.h      | 39 ++++++++++
 lib/realm-backend/src/driver.cc               | 24 ++++++
 lib/realm-backend/src/task_wrapper.cc         | 73 +++++++++++++++
 4 files changed, 147 insertions(+)
 create mode 100644 lib/realm-backend/include/realm-backend/driver.h
 create mode 100644 lib/realm-backend/include/realm-backend/task_wrapper.h
 create mode 100644 lib/realm-backend/src/driver.cc
 create mode 100644 lib/realm-backend/src/task_wrapper.cc

diff --git a/lib/realm-backend/include/realm-backend/driver.h b/lib/realm-backend/include/realm-backend/driver.h
new file mode 100644
index 0000000000..77272c36ad
--- /dev/null
+++ b/lib/realm-backend/include/realm-backend/driver.h
@@ -0,0 +1,11 @@
+#ifndef _FLEXFLOW_REALM_BACKEND_DRIVER_H
+#define _FLEXFLOW_REALM_BACKEND_DRIVER_H
+
+#include "realm.h"
+#include "realm/cmdline.h"
+#include "local-execution/task_invocation.h"
+
+void top_level_task(const void *args, size_t arglen, const void *userdata,
+                    size_t userlen, Realm::Processor p);
+
+#endif
diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h
new file mode 100644
index 0000000000..bf53ca7e93
--- /dev/null
+++ b/lib/realm-backend/include/realm-backend/task_wrapper.h
@@ -0,0 +1,39 @@
+#ifndef _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H
+#define _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H
+
+#include "local-execution/task_registry.h"
+#include "realm-backend/driver.h"
+#include "realm-backend/realm_task_argument_accessor.h"
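+
+// Calling convention: each wrapper below receives a serialized RealmTaskArgs
+// through `args` and writes its output through `result`. A hedged sketch of
+// spawning one (the actual spawn site is outside this header):
+//
+//   RealmTaskArgs task_args{task_id, impl_fn, accessor, &result_storage};
+//   Realm::Event e =
+//       p.spawn(static_cast<Realm::Processor::TaskFuncID>(task_id),
+//               &task_args, sizeof(task_args));
+//   e.wait();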
"realm-backend/realm_task_argument_accessor.h" + +namespace FlexFlow { + +/* The following are general task wrappers to be invoked by the Realm runtime */ + +struct RealmTaskArgs { + task_id_t task_id; + TaskImplFunction impl_function; + TaskArgumentAccessor accessor; + void *result; +}; + +void init_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p); + +void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p); + +void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p); + +void register_wrapper_tasks_init(Realm::Processor p, task_id_t task_id); + +void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id); + +void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id); + +void register_wrapper_tasks(Realm::Processor p, task_id_t task_id, + TaskSignatureAndImpl task_sig_impl); + +} // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/lib/realm-backend/src/driver.cc b/lib/realm-backend/src/driver.cc new file mode 100644 index 0000000000..8cfb038d97 --- /dev/null +++ b/lib/realm-backend/src/driver.cc @@ -0,0 +1,24 @@ +#include "realm-backend/driver.h" + +using namespace Realm; +using namespace FlexFlow; + +Logger log_app("app"); + +int main(int argc, const char **argv) { + Runtime rt; + rt.init(&argc, (char ***)&argv); + + Processor::register_task_by_kind(Processor::LOC_PROC, false /*!global*/, + static_cast(task_id_t::TOP_LEVEL_TASK_ID), + CodeDescriptor(top_level_task), + ProfilingRequestSet()) + .external_wait(); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + + rt.shutdown(rt.collective_spawn(p, static_cast(task_id_t::TOP_LEVEL_TASK_ID), 0, 0)); + return rt.wait_for_shutdown(); +} diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc new file mode 100644 index 0000000000..7361a24cd9 --- /dev/null +++ b/lib/realm-backend/src/task_wrapper.cc @@ -0,0 +1,73 @@ +#include "realm-backend/task_wrapper.h" + +namespace FlexFlow { + +using namespace Realm; + +void init_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) { + RealmTaskArgs const &task_args = + *reinterpret_cast(args); + auto fn = + RealmTaskArgs.impl_function.get().function_ptr; + *reinterpret_cast(RealmTaskArgs.result) = + fn(RealmTaskArgs.acc); +} + +void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) { + RealmTaskArgs const &task_args = + *reinterpret_cast(args); + auto fn = + RealmTaskArgs.impl_function.get().function_ptr; + *reinterpret_cast *>(RealmTaskArgs.result) = + fn(RealmTaskArgs.acc); +} + +void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) { + RealmTaskArgs const &task_args = + *reinterpret_cast(args); + auto fn = + RealmTaskArgs.impl_function.get().function_ptr; + fn(RealmTaskArgs.acc); +} + +void register_wrapper_tasks_init(Processor p, task_id_t task_id) { + Processor::register_task_by_kind( + p.kind(), false /*!global*/, static_cast(task_id), + CodeDescriptor(init_wrapper_task), ProfilingRequestSet()) + .external_wait(); +} + +void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) { + Processor::register_task_by_kind( + p.kind(), false /*!global*/, static_cast(task_id), + CodeDescriptor(fwdbwd_wrapper_task), 
+
+void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) {
+  Processor::register_task_by_kind(
+      p.kind(), false /*!global*/, static_cast<Processor::TaskFuncID>(task_id),
+      CodeDescriptor(fwdbwd_wrapper_task), ProfilingRequestSet())
+      .external_wait();
+}
+
+void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) {
+  Processor::register_task_by_kind(
+      p.kind(), false /*!global*/, static_cast<Processor::TaskFuncID>(task_id),
+      CodeDescriptor(generic_wrapper_task), ProfilingRequestSet())
+      .external_wait();
+}
+
+void register_wrapper_tasks(Processor p, task_id_t task_id,
+                            TaskSignatureAndImpl task_sig_impl) {
+  switch (task_sig_impl.task_signature.type) {
+    case OpTaskType::INIT:
+      register_wrapper_tasks_init(p, task_id);
+      break;
+    case OpTaskType::FWD:
+    case OpTaskType::BWD:
+      register_wrapper_tasks_fwdbwd(p, task_id);
+      break;
+    default:
+      register_wrapper_tasks_generic(p, task_id);
+      break;
+  }
+}
+
+} // namespace FlexFlow
\ No newline at end of file
From 0128abb7f0b76d9a985a5aded5195643a0ef0c1e Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Fri, 31 Jan 2025 18:26:33 -0800
Subject: [PATCH 32/91] Disaggregate local backend

---
 lib/kernels/src/array_shape.cc                |  18 +-
 .../local-execution/gradient_tensor_source.h  |  20 ++
 .../gradient_tensor_t.struct.toml             |  13 +
 .../local-execution/local_args_backing.h      |  32 +-
 .../local-execution/local_tensor_backing.h    |  95 ++++--
 .../local-execution/local_training_backing.h  |  64 ++--
 .../local-execution/loss_tensor_source.h      |  20 ++
 .../local-execution/lowered_tensor_source.h   |   3 +-
 .../local-execution/model_training_instance.h |  27 +-
 .../op_task_to_task_invocation.h              |  30 ++
 .../local-execution/optimizer_tensor_source.h |   3 +-
 .../local-execution/task_argument_accessor.h  |   3 +-
 .../include/local-execution/task_binding.h    |   8 +-
 .../include/local-execution/task_registry.h   |   7 +-
 .../tensor_type_t.variant.toml                |   5 +
 .../src/gradient_tensor_source.cc             |  14 +
 lib/local-execution/src/local_args_backing.cc |  67 ++---
 .../src/local_cost_estimator.cc               |   9 +-
 .../src/local_tensor_backing.cc               | 280 +++++++++++++-----
 .../src/local_training_backing.cc             | 238 +++++++--------
 lib/local-execution/src/loss_functions.cc     |   5 +-
 lib/local-execution/src/loss_tensor_source.cc |  13 +
 .../src/lowered_tensor_source.cc              |   5 +-
 .../src/model_training_instance.cc            |  74 +++--
 .../src/{local-execution => }/op_arg_spec.cc  |   0
 lib/local-execution/src/op_task_invocation.cc |  10 +-
 lib/local-execution/src/op_task_signature.cc  |  32 +-
 .../src/op_task_to_task_invocation.cc         | 108 +++++++
 .../src/optimizer_tensor_source.cc            |   5 +-
 lib/local-execution/src/task_binding.cc       |  52 ++--
 lib/local-execution/src/task_registry.cc      |  13 +-
 .../test/src/test_local_slots_backing.cc      | 118 ++++----
 .../test/src/test_local_task_arg_accessor.cc  |  45 +--
 lib/local-execution/test/src/test_loss_e2e.cc |  24 +-
 .../test/src/test_update_e2e.cc               |   2 +-
 .../include/op-attrs/operator_attrs.h         |   2 +-
 lib/pcg/include/pcg/computation_graph.h       |   2 +
 lib/pcg/include/pcg/optimizer_attrs.h         |   1 +
 lib/pcg/src/pcg/computation_graph.cc          |   5 +
 lib/pcg/src/pcg/optimizer_attrs.cc            |  13 +
 40 files changed, 938 insertions(+), 547 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/gradient_tensor_source.h
 create mode 100644 lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml
 create mode 100644 lib/local-execution/include/local-execution/loss_tensor_source.h
 create mode 100644 lib/local-execution/include/local-execution/op_task_to_task_invocation.h
 create mode 100644 lib/local-execution/src/gradient_tensor_source.cc
 create mode 100644 lib/local-execution/src/loss_tensor_source.cc
 rename lib/local-execution/src/{local-execution => }/op_arg_spec.cc (100%)
 create mode 100644
lib/local-execution/src/op_task_to_task_invocation.cc diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index eb2b88b203..e8685048c6 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -57,15 +57,15 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { ArrayShape ArrayShape::sub_shape(std::optional start, std::optional end) const { - std::optional legion_start = transform( - start, [&](auto const &start_unwrapped) { - return legion_dim_from_ff_dim(start_unwrapped, num_dims()); - }); - - std::optional legion_end = transform( - end, [&](auto const &end_unwrapped) { - return legion_dim_from_ff_dim(end_unwrapped, num_dims()); - }); + std::optional legion_start = + transform(start, [&](auto const &start_unwrapped) { + return legion_dim_from_ff_dim(start_unwrapped, num_dims()); + }); + + std::optional legion_end = + transform(end, [&](auto const &end_unwrapped) { + return legion_dim_from_ff_dim(end_unwrapped, num_dims()); + }); return this->sub_shape(legion_start, legion_end); } diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h new file mode 100644 index 0000000000..bb7a4c7aa8 --- /dev/null +++ b/lib/local-execution/include/local-execution/gradient_tensor_source.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H + +#include "local-execution/gradient_tensor_t.dtg.h" + +namespace FlexFlow { + +struct GradientTensorSource { +public: + GradientTensorSource(); + + gradient_tensor_t new_gradient_tensor(); + +private: + static size_t next_available_gradient_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml b/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml new file mode 100644 index 0000000000..5367ccee07 --- /dev/null +++ b/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "gradient_tensor_t" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + + +[[fields]] +name = "raw_index" +type = "int" diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h index d497c49738..6e6839fea7 100644 --- a/lib/local-execution/include/local-execution/local_args_backing.h +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -1,30 +1,19 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/computation_graph.h" -#include "local-execution/per_device_op_state.h" +#include "local-execution/local_task_argument_accessor.h" #include "local-execution/op_task_invocation.h" +#include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" -#include "local-execution/local_task_argument_accessor.h" +#include "pcg/computation_graph.h" +#include "pcg/layer_guid_t.dtg.h" namespace FlexFlow { struct LocalArgsBacking { LocalArgsBacking(RuntimeArgConfig const &); -public: - void add_per_device_op_state(layer_guid_t const &, - DeviceSpecificDeviceStates const &); - - ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; - - ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; - 
ConcreteArgSpec lower_to_concrete_arg_spec(OpArgRefSpec const &, - ComputationGraph const &, - layer_guid_t const &) const; - public: // arguments std::unordered_map @@ -32,6 +21,17 @@ struct LocalArgsBacking { RuntimeArgConfig runtime_arg_config; }; -} +void add_per_device_op_state(LocalArgsBacking &, + layer_guid_t const &, + DeviceSpecificDeviceStates const &); + +std::optional + get_per_device_op_state_if_exists(LocalArgsBacking const &, + layer_guid_t const &); + +ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &, + RuntimeArgConfig const &); + +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 68a38253f8..825ff0553e 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -3,16 +3,22 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #include "kernels/accessor.h" +#include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/loss_tensor_source.h" +#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/lowered_tensor_source.h" +#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/optimizer_tensor_source.h" #include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" +#include "local-execution/tensor_type_t.dtg.h" +#include "op-attrs/tensor_shape.dtg.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" #include "pcg/layer_guid_t.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { @@ -20,39 +26,72 @@ using TensorBackingMap = std::unordered_map; struct LocalTensorBacking { - LocalTensorBacking(); - -public: - void allocate_layer_tensors(layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_tensors_by_role(TensorRole const &, - layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_optimizer_tensors(tensor_guid_t const &, - std::vector const &, - Allocator &); - TensorSlotsBacking - construct_tensor_slots_backing(TaskBinding const &) const; + LocalTensorBacking() = default; + LocalTensorBacking( + std::unordered_map const + &allocated_tensor_backings, + std::unordered_set const &allocated_tensor_guids, + std::unordered_map const + &allocated_gradient_mapping, + std::unordered_map> const + &allocated_optimizer_mapping, + std::unordered_set const &allocated_loss_tensors); - GenericTensorAccessorW const & - get_tensor_backing(lowered_tensor_t const &) const; - - bool is_tensor_allocated(lowered_tensor_t const &) const; + lowered_tensor_t allocate_tensor(TensorShape const &, Allocator &); public: // tensors TensorBackingMap tensor_backings; - + std::unordered_map tensor_lowering_mapping; - std::unordered_map gradient_tensor_lowering_mapping; - std::unordered_map optimizer_tensor_lowering_mapping; - std::unordered_map loss_tensor_lowering_mapping; + std::unordered_map + gradient_tensor_lowering_mapping; + std::unordered_map + optimizer_tensor_lowering_mapping; + std::unordered_map + loss_tensor_lowering_mapping; + + 
std::unordered_map tensor_gradient_mapping; + std::unordered_map> + tensor_optimizer_mapping; +private: + lowered_tensor_t insert_tensor(GenericTensorAccessorW const &); LoweredTensorSource lowered_tensor_source; }; +void allocate_tensor_guid(LocalTensorBacking &, + tensor_guid_t const &, + TensorShape const &, + Allocator &); +void allocate_gradient_tensor(LocalTensorBacking &, + gradient_tensor_t const &, + tensor_guid_t const &, + TensorShape const &, + Allocator &); +void allocate_optimizer_tensors(LocalTensorBacking &, + std::vector const &, + tensor_guid_t const &, + TensorShape const &, + Allocator &); + +void allocate_all_computation_graph_tensors(LocalTensorBacking &, + GradientTensorSource &, + ComputationGraph const &, + Allocator &); +void allocate_all_optimizer_tensors(LocalTensorBacking &, + OptimizerTensorSource &, + ComputationGraph const &, + Allocator &, + OptimizerAttrs const &); +loss_tensor_t allocate_loss_tensor(LocalTensorBacking &, + LossTensorSource const &, + TensorShape const &, + Allocator &); + +TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, + TaskBinding const &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index a915f3e420..b712be9a93 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -1,58 +1,60 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#include "local-execution/local_tensor_backing.h" #include "local-execution/local_args_backing.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/optimizer_tensor_source.h" #include "local-execution/task_registry.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/optimizer_tensor_source.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, - RuntimeArgConfig const &); - void register_and_allocate_layer(layer_guid_t const &); - void allocate_layer_optimizer_tensors(layer_guid_t const &, - OptimizerAttrs const &); - - void execute_init(layer_guid_t const &); - std::optional execute_forward(layer_guid_t const &); - void compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); - std::optional execute_backward(layer_guid_t const &); - void execute_update(layer_guid_t const &, OptimizerAttrs const &); - - TaskArgumentAccessor - get_task_arg_accessor(TaskInvocation const &) const; - - TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const; + LocalTensorBacking const &, + LocalArgsBacking const &); +public: LocalTensorBacking local_tensor_backing; LocalArgsBacking local_args_backing; -private: - DeviceSpecificDeviceStates call_init_task_impl(task_id_t, - TaskArgumentAccessor const &); - std::optional call_task_impl(task_id_t, TaskArgumentAccessor); - -private: Allocator allocator; ComputationGraph computation_graph; TaskRegistry task_registry; - // optimizer - OptimizerTensorSource optimizer_tensor_source; - std::unordered_map> layer_optimizer_tensor_ids; +private: + GradientTensorSource gradient_tensor_source; }; 
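+
+// Driving a training step is now a sequence of free-function calls over the
+// backing; a hedged sketch (assumes pcg's topological_ordering and the
+// reversed helper from utils):
+//
+//   for (layer_guid_t const &l : topological_ordering(cg)) {
+//     execute_init(backing, l);
+//   }
+//   for (layer_guid_t const &l : topological_ordering(cg)) {
+//     execute_forward(backing, l);
+//   }
+//   compute_loss(backing, loss_attrs, logit_tensor, label_tensor);
+//   for (layer_guid_t const &l : reversed(topological_ordering(cg))) {
+//     execute_backward(backing, l);
+//     execute_update(backing, l, optimizer_attrs);
+//   }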
+DeviceSpecificDeviceStates call_init_task_impl(TaskRegistry const &, + task_id_t task_id, + TaskArgumentAccessor const &acc); + +std::optional call_task_impl(TaskRegistry const &, + task_id_t task_id, + TaskArgumentAccessor acc); + +void execute_init(LocalTrainingBacking &, layer_guid_t const &); +std::optional execute_forward(LocalTrainingBacking &, + layer_guid_t const &); +std::optional execute_backward(LocalTrainingBacking &, + layer_guid_t const &); +void compute_loss(LocalTrainingBacking const &, + LossAttrs const &, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); +void execute_update(LocalTrainingBacking &, + layer_guid_t const &, + OptimizerAttrs const &); + +TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, + LocalArgsBacking const &, + TaskInvocation const &, + Allocator &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h new file mode 100644 index 0000000000..2b55f1af01 --- /dev/null +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H + +#include "local-execution/loss_tensor_t.dtg.h" + +namespace FlexFlow { + +struct LossTensorSource { +public: + LossTensorSource(); + + loss_tensor_t new_loss_tensor(); + +private: + static size_t next_available_loss_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h index 63cc2cd31e..e4fc4ff56c 100644 --- a/lib/local-execution/include/local-execution/lowered_tensor_source.h +++ b/lib/local-execution/include/local-execution/lowered_tensor_source.h @@ -14,8 +14,7 @@ struct LoweredTensorSource { private: static size_t next_available_lowered_tensor_id; }; - -} // namespace FlexFlow +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index dd6a6f33d7..81aacf2a53 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H #include "local-execution/local_training_backing.h" +#include "local-execution/loss_tensor_t.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { @@ -12,27 +12,28 @@ using PerLayerElapsedTime = std::unordered_map>; struct ModelTrainingInstance { - ModelTrainingInstance(Allocator const &, - ComputationGraph const &, - RuntimeArgConfig const &, + ModelTrainingInstance(LocalTrainingBacking const &, + tensor_guid_t const & logit_tensor, + TensorShape const & label_tensor_shape, LossAttrs const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, OptimizerAttrs const &); - void execute_init(); - PerLayerElapsedTime execute_forward(); - PerLayerElapsedTime execute_backward(); - void execute_update(); - - ComputationGraph computation_graph; LocalTrainingBacking training_backing; LossAttrs loss_attrs; + OptimizerAttrs optimizer_attrs; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; - OptimizerAttrs optimizer_attrs; + 
+private: + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; }; +void init(ModelTrainingInstance &); +PerLayerElapsedTime forward(ModelTrainingInstance &); +PerLayerElapsedTime backward(ModelTrainingInstance &); +void update(ModelTrainingInstance &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h new file mode 100644 index 0000000000..44e10d4b51 --- /dev/null +++ b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H + +#include "local-execution/device_specific_device_states.dtg.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/layer_guid_t.dtg.h" + +namespace FlexFlow { + +TaskInvocation + lower_to_task_invocation(OpTaskInvocation const &, + layer_guid_t const &, + ComputationGraph const &, + std::optional const &); + +ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, + RuntimeArgConfig const &); + +ConcreteArgSpec lower_to_concrete_arg_spec( + OpArgRefSpec const &, + ComputationGraph const &, + layer_guid_t const &, + std::optional const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h index fc5015b299..658c545225 100644 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -14,8 +14,7 @@ struct OptimizerTensorSource { private: static size_t next_available_optimizer_tensor_id; }; - -} // namespace FlexFlow +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 8b8516045d..16a63a789b 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -65,7 +65,8 @@ struct TaskArgumentAccessor { // } // template - // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const { + // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const + // { // return std::get>( // this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); // } diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index 33636616b3..21fc813a6b 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -1,14 +1,14 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H +#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/optimizer_tensor_t.dtg.h" #include "local-execution/slot_id_t.dtg.h" #include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_arg_spec.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include 
"local-execution/loss_tensor_t.dtg.h" #include "local-execution/tensor_type_t.dtg.h" namespace FlexFlow { @@ -17,10 +17,10 @@ struct TaskBinding { TaskBinding() = default; void bind(int, tensor_guid_t const &); - void bind(slot_id_t, tensor_guid_t const &); + void bind(slot_id_t, tensor_guid_t const &); void bind_grad(int, tensor_guid_t const &); - void bind_grad(slot_id_t, tensor_guid_t const &); + void bind_grad(slot_id_t, tensor_guid_t const &); void bind(int, optimizer_tensor_t const &); void bind(slot_id_t, optimizer_tensor_t const &); diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index fa3e558337..1669822c83 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -15,8 +15,11 @@ void register_tasks_for_layer(TaskRegistry &, ComputationGraphOpAttrs const &attrs); bool registry_contains_task_for_layer(TaskRegistry const &, - layer_guid_t const &, - OpTaskType const &); + layer_guid_t const &, + OpTaskType const &); + +void register_all_computation_graph_tasks(TaskRegistry &, + ComputationGraph const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml index d4e525c348..cd3520ee5d 100644 --- a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml +++ b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml @@ -10,6 +10,7 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", "local-execution/optimizer_tensor_t.dtg.h", + "local-execution/gradient_tensor_t.dtg.h", "local-execution/loss_tensor_t.dtg.h" ] @@ -17,6 +18,10 @@ includes = [ type = "::FlexFlow::tensor_guid_t" key = "tensor_guid" +[[values]] +type = "::FlexFlow::gradient_tensor_t" +key = "gradient_tensor" + [[values]] type = "::FlexFlow::optimizer_tensor_t" key = "optimizer_tensor" diff --git a/lib/local-execution/src/gradient_tensor_source.cc b/lib/local-execution/src/gradient_tensor_source.cc new file mode 100644 index 0000000000..28cec16ef9 --- /dev/null +++ b/lib/local-execution/src/gradient_tensor_source.cc @@ -0,0 +1,14 @@ +#include "local-execution/gradient_tensor_source.h" + +namespace FlexFlow { + +size_t GradientTensorSource::next_available_gradient_tensor_id = 0; + +GradientTensorSource::GradientTensorSource() {} + +gradient_tensor_t GradientTensorSource::new_gradient_tensor() { + return gradient_tensor_t{ + GradientTensorSource::next_available_gradient_tensor_id++}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc index 0c3cfe70e8..d8a94fb2c5 100644 --- a/lib/local-execution/src/local_args_backing.cc +++ b/lib/local-execution/src/local_args_backing.cc @@ -1,62 +1,43 @@ #include "local-execution/local_args_backing.h" -#include "utils/containers/map_values.h" +#include "local-execution/op_task_to_task_invocation.h" +#include "op-attrs/parallel_tensor_shape.h" #include "utils/containers/contains_key.h" +#include "utils/containers/map_values.h" #include "utils/overload.h" -#include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { +LocalArgsBacking::LocalArgsBacking(RuntimeArgConfig const &runtime_arg_config) + : runtime_arg_config(runtime_arg_config){}; -void LocalArgsBacking::add_per_device_op_state( - layer_guid_t const &op_guid, - DeviceSpecificDeviceStates const &device_state) { - 
this->per_device_op_states.insert({op_guid, device_state}); +void add_per_device_op_state(LocalArgsBacking &local_args_backing, + layer_guid_t const &op_guid, + DeviceSpecificDeviceStates const &device_state) { + local_args_backing.per_device_op_states.insert({op_guid, device_state}); } -ArgSlotsBacking LocalArgsBacking::construct_arg_slots_backing( - TaskBinding const &binding) const { +std::optional get_per_device_op_state_if_exists( + LocalArgsBacking const &local_args_backing, + layer_guid_t const &layer_guid) { + if (contains_key(local_args_backing.per_device_op_states, layer_guid)) { + return local_args_backing.per_device_op_states.at(layer_guid); + } else { + return std::nullopt; + } +} + +ArgSlotsBacking + construct_arg_slots_backing(TaskBinding const &binding, + RuntimeArgConfig const &runtime_arg_config) { return map_values( binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { return arg_binding.template visit( overload{[&](RuntimeArgRefSpec const &s) { - return this->lower_to_concrete_arg_spec(s); + return lower_to_concrete_arg_spec(s, runtime_arg_config); }, [](ConcreteArgSpec const &s) { return s; }}); }); ; } -ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( - OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const & cg, layer_guid_t const &op_guid) const { - if (op_arg_ref_spec.holds()) { - assert(contains_key(this->per_device_op_states, op_guid)); - DeviceSpecificDeviceStates device_specific = - per_device_op_states.at(op_guid); - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_specific, 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - tensor_guid_t input_tensor = get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); - TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); - ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); - return ConcreteArgSpec::create(shape); - } else { - throw mk_runtime_error("Unhandled op arg ref type"); - } -} - -ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( - RuntimeArgRefSpec const &runtime_arg_ref_spec) const { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create( - *(this->runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); - } -} - -} +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index b416378e66..b959f31a8b 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,16 +1,16 @@ #include "local-execution/local_cost_estimator.h" -#include "local-execution/tensor_lowering.h" #include "kernels/device.h" #include "kernels/local_cuda_allocator.h" +#include "local-execution/tensor_lowering.h" #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" #include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" +#include "utils/containers/sum.h" #include "utils/containers/transform.h" #include "utils/containers/values.h" -#include "utils/containers/sum.h" namespace FlexFlow { @@ -64,9 +64,8 @@ CostDetails 
LocalCostEstimator::estimate_cost( }), get_vector_piece_attrs(outputs)); - LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - this->runtime_arg_config); + LocalTrainingBacking local_backing( + allocator, cg_builder.computation_graph, this->runtime_arg_config); local_backing.register_and_allocate_layer(layer_added_result.layer); local_backing.execute_init(layer_added_result.layer); float fwd = local_backing.execute_forward(layer_added_result.layer).value(); diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index 9da74c27b9..de058d88ad 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,119 +1,235 @@ #include "local-execution/local_tensor_backing.h" +#include "local-execution/slot_grad_id.dtg.h" #include "local-execution/tensor_lowering.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" #include "utils/containers/contains_key.h" +#include "utils/containers/keys.h" #include "utils/overload.h" -#include "local-execution/slot_grad_id.dtg.h" namespace FlexFlow { -LocalTensorBacking::LocalTensorBacking() {}; +LocalTensorBacking::LocalTensorBacking( + std::unordered_map const + &allocated_tensor_backings, + std::unordered_set const &allocated_tensor_guids, + std::unordered_map const + &allocated_gradient_mapping, + std::unordered_map> const + &allocated_optimizer_mapping, + std::unordered_set const &allocated_loss_tensors) + : tensor_gradient_mapping(allocated_gradient_mapping), + tensor_optimizer_mapping(allocated_optimizer_mapping) { -void LocalTensorBacking::allocate_layer_tensors( - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - this->allocate_tensors_by_role( - TensorRole::INPUT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::WEIGHT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::OUTPUT, layer_guid, computation_graph, allocator); -} + // computation graph tensors + for (tensor_guid_t const &allocated_tensor_guid : allocated_tensor_guids) { + lowered_tensor_t lowered_tensor = this->insert_tensor( + allocated_tensor_backings.at(TensorTypeVariant{allocated_tensor_guid})); + this->tensor_lowering_mapping.insert( + {allocated_tensor_guid, lowered_tensor}); + } -void LocalTensorBacking::allocate_tensors_by_role( - TensorRole const &role, - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - std::vector tensors; - switch (role) { - case TensorRole::INPUT: - tensors = get_incoming_inputs(computation_graph, layer_guid); - break; - case TensorRole::WEIGHT: - tensors = get_incoming_weights(computation_graph, layer_guid); - break; - case TensorRole::OUTPUT: - tensors = get_outgoing_tensors(computation_graph, layer_guid); - break; - default: - throw mk_runtime_error("Invalid tensor role, got {}", role); + // gradient tensors + for (std::pair const + &tensor_guid_gradient_pair : allocated_gradient_mapping) { + gradient_tensor_t allocated_gradient_tensor = + tensor_guid_gradient_pair.second; + lowered_tensor_t lowered_tensor = + this->insert_tensor(allocated_tensor_backings.at( + TensorTypeVariant{allocated_gradient_tensor})); + this->gradient_tensor_lowering_mapping.insert( + {allocated_gradient_tensor, lowered_tensor}); } - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs 
tensor_attrs = get_tensor_attrs(computation_graph, tensor); - // tensor allocation - if (!contains_key(this->tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_backings.insert({reduced_tensor, tensor_backing}); + // optimizer tensors + for (std::pair> const + &tensor_guid_optimizers_pair : allocated_optimizer_mapping) { + for (optimizer_tensor_t const &allocated_optimizer_tensor : + tensor_guid_optimizers_pair.second) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(allocated_tensor_backings.at( + TensorTypeVariant{allocated_optimizer_tensor})); + this->optimizer_tensor_lowering_mapping.insert( + {allocated_optimizer_tensor, lowered_tensor}); } + } + + // loss tensors + for (loss_tensor_t const &allocated_loss_tensor : allocated_loss_tensors) { + lowered_tensor_t lowered_tensor = this->insert_tensor( + allocated_tensor_backings.at(TensorTypeVariant{allocated_loss_tensor})); + this->loss_tensor_lowering_mapping.insert( + {allocated_loss_tensor, lowered_tensor}); + } + + // sanity check that backings match up with the mappings + assert(this->tensor_backings.size() == allocated_tensor_backings.size()); +}; + +lowered_tensor_t LocalTensorBacking::insert_tensor( + GenericTensorAccessorW const &tensor_backing) { + lowered_tensor_t lowered_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->tensor_backings.insert({lowered_tensor, tensor_backing}); + return lowered_tensor; +} + +lowered_tensor_t + LocalTensorBacking::allocate_tensor(TensorShape const &tensor_shape, + Allocator &allocator) { + GenericTensorAccessorW tensor_backing = + allocator.allocate_tensor(tensor_shape); + return this->insert_tensor(tensor_backing); +} - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); - GenericTensorAccessorW gradient_tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_backings.insert( - {reduced_tensor, gradient_tensor_backing}); +void allocate_tensor_guid(LocalTensorBacking &local_tensor_backing, + tensor_guid_t const &tensor_guid, + TensorShape const &tensor_shape, + Allocator &allocator) { + if (!contains_key(local_tensor_backing.tensor_lowering_mapping, + tensor_guid)) { + lowered_tensor_t lowered_tensor = + local_tensor_backing.allocate_tensor(tensor_shape, allocator); + local_tensor_backing.tensor_lowering_mapping.insert( + {tensor_guid, lowered_tensor}); + } +} + +void allocate_gradient_tensor(LocalTensorBacking &local_tensor_backing, + gradient_tensor_t const &gradient_tensor, + tensor_guid_t const &tensor_guid, + TensorShape const &tensor_shape, + Allocator &allocator) { + if (!contains_key(local_tensor_backing.tensor_gradient_mapping, + tensor_guid)) { + local_tensor_backing.tensor_gradient_mapping.insert( + {tensor_guid, gradient_tensor}); + lowered_tensor_t lowered_tensor = + local_tensor_backing.allocate_tensor(tensor_shape, allocator); + local_tensor_backing.gradient_tensor_lowering_mapping.insert( + {gradient_tensor, lowered_tensor}); + } +} + +void allocate_optimizer_tensors( + LocalTensorBacking &local_tensor_backing, + std::vector const 
&optimizer_tensors,
+    tensor_guid_t const &tensor_guid,
+    TensorShape const &tensor_shape,
+    Allocator &allocator) {
+  if (!contains_key(local_tensor_backing.tensor_optimizer_mapping,
+                    tensor_guid)) {
+    // insert the given optimizer tensors into the mappings
+    for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) {
+      // allocate a lowered tensor backing for each optimizer tensor
+      lowered_tensor_t lowered_tensor =
+          local_tensor_backing.allocate_tensor(tensor_shape, allocator);
+      local_tensor_backing.optimizer_tensor_lowering_mapping.insert(
+          {optimizer_tensor, lowered_tensor});
     }
+    local_tensor_backing.tensor_optimizer_mapping.insert(
+        {tensor_guid, optimizer_tensors});
   }
 }
 
-void LocalTensorBacking::allocate_optimizer_tensors(
-    tensor_guid_t const &weight,
-    std::vector<optimizer_tensor_t> const& optimizer_tensors,
+void allocate_loss_tensor(LocalTensorBacking &local_tensor_backing,
+                          loss_tensor_t const &loss_tensor,
+                          TensorShape const &tensor_shape,
+                          Allocator &allocator) {
+  lowered_tensor_t lowered_tensor =
+      local_tensor_backing.allocate_tensor(tensor_shape, allocator);
+  local_tensor_backing.loss_tensor_lowering_mapping.insert(
+      {loss_tensor, lowered_tensor});
+}
+
+void allocate_all_computation_graph_tensors(
+    LocalTensorBacking &local_tensor_backing,
+    GradientTensorSource &gradient_tensor_source,
+    ComputationGraph const &computation_graph,
     Allocator &allocator) {
-  GenericTensorAccessorW weight_backing = this->get_tensor_backing(this->tensor_lowering_mapping.at(weight));
-  for (optimizer_tensor_t const & optimizer_tensor: optimizer_tensors) {
-    // optimizer tensor allocation
-    if (!contains_key(this->optimizer_tensor_lowering_mapping, optimizer_tensor)) {
-      lowered_tensor_t buffer_tensor = this->lowered_tensor_source.new_lowered_tensor();
-      this->optimizer_tensor_lowering_mapping.insert({optimizer_tensor, buffer_tensor});
-      GenericTensorAccessorW buffer_backing = allocator.allocate_tensor(
-          get_tensor_shape(weight_backing.shape, weight_backing.data_type));
-      this->tensor_backings.insert({buffer_tensor, buffer_backing});
+  // allocate each layer's tensors and gradient tensors
+  for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
+    allocate_tensor_guid(
+        local_tensor_backing, tensor_guid, tensor_attrs.shape, allocator);
+
+    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+      gradient_tensor_t gradient_tensor =
+          gradient_tensor_source.new_gradient_tensor();
+      allocate_gradient_tensor(local_tensor_backing,
+                               gradient_tensor,
+                               tensor_guid,
+                               tensor_attrs.shape,
+                               allocator);
     }
   }
 }
 
-bool LocalTensorBacking::is_tensor_allocated(lowered_tensor_t const & tensor_id) const {
-  return contains_key(tensor_backings, tensor_id);
+void allocate_all_optimizer_tensors(
+    LocalTensorBacking &local_tensor_backing,
+    OptimizerTensorSource &optimizer_tensor_source,
+    ComputationGraph const &computation_graph,
+    Allocator &allocator,
+    OptimizerAttrs const &optimizer_attrs) {
+  for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
+    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+      std::vector<optimizer_tensor_t> optimizer_tensors;
+      for (int i = 0; i < get_num_optimizer_tensors(optimizer_attrs); ++i) {
+        optimizer_tensors.push_back(
+            optimizer_tensor_source.new_optimizer_tensor());
+      }
+      allocate_optimizer_tensors(local_tensor_backing,
+                                 optimizer_tensors,
+                                 tensor_guid,
+                                 tensor_attrs.shape,
+                                 allocator);
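+      // one buffer is created per optimizer slot reported by
+      // get_num_optimizer_tensors, e.g. a single momentum buffer for SGD
+      // with momentum (illustrative; the slot count depends on the
+      // OptimizerAttrs)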
+ } + } } -GenericTensorAccessorW const &LocalTensorBacking::get_tensor_backing( - lowered_tensor_t const &tensor_id) const { - return this->tensor_backings.at(tensor_id); +loss_tensor_t allocate_loss_tensor(LocalTensorBacking &local_tensor_backing, + LossTensorSource &loss_tensor_source, + TensorShape const &tensor_shape, + Allocator &allocator) { + loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); + lowered_tensor_t lowered_tensor = + local_tensor_backing.allocate_tensor(tensor_shape, allocator); + local_tensor_backing.loss_tensor_lowering_mapping.insert( + {loss_tensor, lowered_tensor}); + return loss_tensor; } -TensorSlotsBacking LocalTensorBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { +TensorSlotsBacking construct_tensor_slots_backing( + LocalTensorBacking const &local_tensor_backing, + TaskBinding const &binding) { TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - lowered_tensor_t tensor_id = [&] { - TensorTypeVariant tensor_type = tensor_binding.second; - if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::FORWARD) { - return this->tensor_lowering_mapping.at(tensor_type.get()); - } else if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { - return this->gradient_tensor_lowering_mapping.at(tensor_type.get()); - } else if (tensor_type.has()) { - return this->optimizer_tensor_lowering_mapping.at(tensor_type.get()); - } else if (tensor_type.has()) { - return this->loss_tensor_lowering_mapping.at(tensor_type.get()); - } else { - throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); - } - }(); + lowered_tensor_t lowered_tensor = + tensor_binding.second.visit(overload{ + [&](tensor_guid_t const &t) { + return local_tensor_backing.tensor_lowering_mapping.at(t); + }, + [&](gradient_tensor_t const &t) { + return local_tensor_backing.gradient_tensor_lowering_mapping.at( + t); + }, + [&](optimizer_tensor_t const &t) { + return local_tensor_backing.optimizer_tensor_lowering_mapping.at( + t); + }, + [&](loss_tensor_t const &t) { + return local_tensor_backing.loss_tensor_lowering_mapping.at(t); + }, + }); - GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); + GenericTensorAccessorW accessor = + local_tensor_backing.tensor_backings.at(lowered_tensor); mapping.insert({slot_tensor_type_id, accessor}); } diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 22dc3b8397..4893d9be88 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,5 +1,6 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" +#include "local-execution/op_task_to_task_invocation.h" #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" @@ -17,127 +18,152 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config) + LocalTensorBacking const &local_tensor_backing, + LocalArgsBacking const &local_args_backing) : allocator(allocator), computation_graph(computation_graph), - local_args_backing(runtime_arg_config), - task_registry(empty_task_registry()) {}; - -void LocalTrainingBacking::register_and_allocate_layer( - 
layer_guid_t const &node) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - this->local_tensor_backing.allocate_layer_tensors( - node, this->computation_graph, this->allocator); - register_tasks_for_layer(this->task_registry, node, attrs); -} - -void LocalTrainingBacking::allocate_layer_optimizer_tensors( - layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - if (attrs.has()) { - TaskSignature sig = get_update_signature(optimizer_attrs); - tensor_guid_t weight_tensor = - get_only(get_outgoing_tensors(this->computation_graph, node)); - - std::vector optimizer_tensors; - for (TensorTypeSlotSpec const & tensor_type_slot_spec: values(sig.tensor_guid_slots)) { - optimizer_tensors.push_back(this->optimizer_tensor_source.new_optimizer_tensor()); - } - this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); - this->local_tensor_backing.allocate_optimizer_tensors( - weight_tensor, optimizer_tensors, this->allocator); - } + task_registry(empty_task_registry()), + local_tensor_backing(local_tensor_backing), + local_args_backing(local_args_backing) { + allocate_all_computation_graph_tensors(this->local_tensor_backing, + this->gradient_tensor_source, + this->computation_graph, + this->allocator); + register_all_computation_graph_tasks(this->task_registry, + this->computation_graph); } DeviceSpecificDeviceStates - LocalTrainingBacking::call_init_task_impl(task_id_t task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = - this->task_registry.task_mapping.at(task_id); + call_init_task_impl(TaskRegistry const &task_registry, + task_id_t task_id, + TaskArgumentAccessor const &acc) { + TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } -std::optional - LocalTrainingBacking::call_task_impl(task_id_t task_id, - TaskArgumentAccessor acc) { - TaskSignatureAndImpl task_sig_impl = - this->task_registry.task_mapping.at(task_id); +std::optional call_task_impl(TaskRegistry const &task_registry, + task_id_t task_id, + TaskArgumentAccessor acc) { + TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } -void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer( - this->task_registry, operator_node, OpTaskType::INIT)) { +void execute_init(LocalTrainingBacking &local_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(local_training_backing.task_registry, + operator_node, + OpTaskType::INIT)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - - TaskInvocation invocation = this->lower_to_task_invocation(init(attrs), operator_node); + get_layer_attrs(local_training_backing.computation_graph, operator_node) + .attrs; + + TaskInvocation invocation = + lower_to_task_invocation(init(attrs), + operator_node, + local_training_backing.computation_graph, + std::nullopt); TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); - DeviceSpecificDeviceStates device_state = - this->call_init_task_impl(invocation.task_id, accessor); - this->local_args_backing.add_per_device_op_state(operator_node, - device_state); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + 
local_training_backing.local_args_backing, + invocation); + DeviceSpecificDeviceStates device_state = call_init_task_impl( + local_training_backing.task_registry, invocation.task_id, accessor); + add_per_device_op_state( + local_training_backing.local_args_backing, operator_node, device_state); } } std::optional - LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer( - this->task_registry, operator_node, OpTaskType::FWD)) { + execute_forward(LocalTrainingBacking &local_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(local_training_backing.task_registry, + operator_node, + OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - - TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs), operator_node); + get_layer_attrs(local_training_backing.computation_graph, operator_node) + .attrs; + + std::optional device_state = + get_per_device_op_state_if_exists( + local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = + lower_to_task_invocation(forward(attrs), + operator_node, + local_training_backing.computation_graph, + device_state); TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); - return this->call_task_impl(invocation.task_id, accessor); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + invocation, + local_training_backing.allocator); + return call_task_impl( + local_training_backing.task_registry, invocation.task_id, accessor); } else { return std::nullopt; } } -void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor) { +void compute_loss(LocalTrainingBacking const &local_training_backing, + LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor) { TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + loss_invocation); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get().function_ptr(loss_accessor); } std::optional - LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer( - this->task_registry, operator_node, OpTaskType::BWD)) { + execute_backward(LocalTrainingBacking &local_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(local_training_backing.task_registry, + operator_node, + OpTaskType::BWD)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - - TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs), operator_node); + get_layer_attrs(local_training_backing.computation_graph, operator_node) + .attrs; + + std::optional device_state = + get_per_device_op_state_if_exists( + local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = + lower_to_task_invocation(backward(attrs), + operator_node, + local_training_backing.computation_graph, + device_state); 
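+    // the per-device state captured during execute_init is threaded back
+    // into the lowered invocation so that per-device op-state argument refs
+    // resolve to this layer's initialized state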
TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); - return this->call_task_impl(invocation.task_id, accessor); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + invocation, + local_training_backing.allocator); + return call_task_impl( + local_training_backing.task_registry, invocation.task_id, accessor); } else { return std::nullopt; } } -void LocalTrainingBacking::execute_update( - layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { - LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); +void execute_update(LocalTrainingBacking &local_training_backing, + layer_guid_t const &node, + OptimizerAttrs const &optimizer_attrs) { + LayerAttrs layer_attrs = + get_layer_attrs(local_training_backing.computation_graph, node); if (layer_attrs.attrs.has()) { // get tensors - tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector optimizer_buffer_tensors = this->layer_optimizer_tensor_ids.at(node); + tensor_guid_t weight_tensor = get_only( + get_outgoing_tensors(local_training_backing.computation_graph, node)); + std::vector optimizer_buffer_tensors = + local_training_backing.local_tensor_backing.tensor_optimizer_mapping.at( + weight_tensor); // get invocation TaskInvocation invocation = get_update_invocation( @@ -148,62 +174,26 @@ void LocalTrainingBacking::execute_update( // execute update TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + invocation, + local_training_backing.allocator); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get().function_ptr(accessor); } } -TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation) const { +TaskArgumentAccessor + get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, + LocalArgsBacking const &local_args_backing, + TaskInvocation const &invocation, + Allocator &allocator) { TensorSlotsBacking tensor_slots_backing = - this->local_tensor_backing.construct_tensor_slots_backing( - invocation.binding); - ArgSlotsBacking arg_slots_backing = - this->local_args_backing.construct_arg_slots_backing(invocation.binding); + construct_tensor_slots_backing(local_tensor_backing, invocation.binding); + ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( + invocation.binding, local_args_backing.runtime_arg_config); return TaskArgumentAccessor::create( - this->allocator, tensor_slots_backing, arg_slots_backing); -} - -TaskInvocation LocalTrainingBacking::lower_to_task_invocation(OpTaskInvocation const & op_task_invocation, layer_guid_t const & layer_guid) const { - TaskBinding binding; - // tensors - for (auto const & tensor_binding: op_task_invocation.binding.get_tensor_bindings()) { - tensor_guid_t tensor_to_bind = [&] { - switch (tensor_binding.second.role) { - case TensorRole::INPUT: - return get_incoming_inputs(this->computation_graph, layer_guid).at(tensor_binding.second.idx); - case TensorRole::OUTPUT: - return get_outgoing_tensors(this->computation_graph, layer_guid).at(tensor_binding.second.idx); - case TensorRole::WEIGHT: - return get_incoming_weights(this->computation_graph, layer_guid).at(tensor_binding.second.idx); - default: - throw mk_runtime_error(fmt::format("Invalid tensor role {}", tensor_binding.second.role)); - } - }(); - - if 
(tensor_binding.first.is_grad == IsGrad::NO) { - binding.bind(tensor_binding.first.slot_id, tensor_to_bind); - } else if (tensor_binding.first.is_grad == IsGrad::YES) { - binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind); - } else { - throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad)); - } - } - - // args - for (auto const & arg_binding: op_task_invocation.binding.get_arg_bindings()) { - if (arg_binding.second.has()) { - ConcreteArgSpec concrete_arg = this->local_args_backing.lower_to_concrete_arg_spec(arg_binding.second.get(), this->computation_graph, layer_guid); - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); - } else if (arg_binding.second.has()) { - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get()}); - } else { - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get()}); - } - } - - return TaskInvocation{op_task_invocation.task_id, binding}; + allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index bfb3c0a32b..0a89dfd9d5 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -33,9 +33,8 @@ TaskSignature get_loss_bwd_signature() { return sig; } -TaskInvocation backward(LossAttrs const &attrs, - tensor_guid_t logit, - loss_tensor_t label) { +TaskInvocation + backward(LossAttrs const &attrs, tensor_guid_t logit, loss_tensor_t label) { TaskBinding b; b.bind(LOGIT, logit); b.bind(LABEL, label); diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc new file mode 100644 index 0000000000..da1efa6b85 --- /dev/null +++ b/lib/local-execution/src/loss_tensor_source.cc @@ -0,0 +1,13 @@ +#include "local-execution/loss_tensor_source.h" + +namespace FlexFlow { + +size_t LossTensorSource::next_available_loss_tensor_id = 0; + +LossTensorSource::LossTensorSource() {} + +loss_tensor_t LossTensorSource::new_loss_tensor() { + return loss_tensor_t{LossTensorSource::next_available_loss_tensor_id++}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/lowered_tensor_source.cc b/lib/local-execution/src/lowered_tensor_source.cc index 05960ff5e2..af80aa2335 100644 --- a/lib/local-execution/src/lowered_tensor_source.cc +++ b/lib/local-execution/src/lowered_tensor_source.cc @@ -7,7 +7,8 @@ size_t LoweredTensorSource::next_available_lowered_tensor_id = 0; LoweredTensorSource::LoweredTensorSource() {} lowered_tensor_t LoweredTensorSource::new_lowered_tensor() { - return lowered_tensor_t{LoweredTensorSource::next_available_lowered_tensor_id++}; + return lowered_tensor_t{ + LoweredTensorSource::next_available_lowered_tensor_id++}; } -} +} // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index f57c5db73a..6691bd3a03 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -6,68 +6,66 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( - Allocator const &allocator, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, + LocalTrainingBacking const &local_training_backing, + tensor_guid_t const & logit_tensor, + TensorShape const &label_tensor_shape, LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t 
const &label_tensor, OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - training_backing(allocator, - computation_graph, - runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { - - // allocate each layer's tensors - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.register_and_allocate_layer(node); - this->training_backing.allocate_layer_optimizer_tensors( - node, this->optimizer_attrs); - } + : training_backing(local_training_backing), loss_attrs(loss_attrs), + optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), + label_tensor( + allocate_loss_tensor(this->training_backing.local_tensor_backing, + this->loss_tensor_source, + label_tensor_shape, + this->training_backing.allocator)) { + allocate_all_optimizer_tensors(this->training_backing.local_tensor_backing, + this->optimizer_tensor_source, + this->training_backing.computation_graph, + this->training_backing.allocator, + this->optimizer_attrs); } -void ModelTrainingInstance::execute_init() { - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.execute_init(node); +void init(ModelTrainingInstance &model_training_instance) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { + execute_init(model_training_instance.training_backing, node); } } -PerLayerElapsedTime ModelTrainingInstance::execute_forward() { +PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { std::optional elapsed_time = - this->training_backing.execute_forward(node); + execute_forward(model_training_instance.training_backing, node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -PerLayerElapsedTime ModelTrainingInstance::execute_backward() { - this->training_backing.compute_loss( - this->loss_attrs, this->logit_tensor, this->label_tensor); +PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { + compute_loss(model_training_instance.training_backing, + model_training_instance.loss_attrs, + model_training_instance.logit_tensor, + model_training_instance.label_tensor); PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : - reversed(topological_ordering(this->computation_graph))) { + for (layer_guid_t const &node : reversed(topological_ordering( + model_training_instance.training_backing.computation_graph))) { std::optional elapsed_time = - this->training_backing.execute_backward(node); + execute_backward(model_training_instance.training_backing, node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -void ModelTrainingInstance::execute_update() { +void update(ModelTrainingInstance & model_training_instance) { for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.execute_update(node, this->optimizer_attrs); + topological_ordering(model_training_instance.training_backing.computation_graph)) { + execute_update(model_training_instance.training_backing, node, model_training_instance.optimizer_attrs); } - 
this->optimizer_attrs = - get_optimizer_attrs_for_next_iter(this->optimizer_attrs); + model_training_instance.optimizer_attrs = + get_optimizer_attrs_for_next_iter(model_training_instance.optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/op_arg_spec.cc b/lib/local-execution/src/op_arg_spec.cc similarity index 100% rename from lib/local-execution/src/local-execution/op_arg_spec.cc rename to lib/local-execution/src/op_arg_spec.cc diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index b6771e6eb8..19c8894b05 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -20,8 +20,7 @@ void OpTaskBinding::bind(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert( - {SlotGradId{slot, IsGrad::NO}, tensor_spec}); + this->tensor_bindings.insert({SlotGradId{slot, IsGrad::NO}, tensor_spec}); } void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { @@ -29,8 +28,7 @@ void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind_grad(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert( - {SlotGradId{slot, IsGrad::YES}, tensor_spec}); + this->tensor_bindings.insert({SlotGradId{slot, IsGrad::YES}, tensor_spec}); } void OpTaskBinding::insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec) { @@ -91,8 +89,8 @@ bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotGradId tensor_key = SlotGradId{ - op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; + SlotGradId tensor_key = + SlotGradId{op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 69b5463a0d..932b330453 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -12,12 +12,8 @@ void OpTaskSignature::add_input_slot(int name, SlotType slot_type) { } void OpTaskSignature::add_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::INPUT, - IsGrad::NO, - OpSlotOptions::NECESSARY}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -27,12 +23,8 @@ void OpTaskSignature::add_optional_input_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::INPUT, - IsGrad::NO, - OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -88,12 +80,8 @@ void OpTaskSignature::add_bwd_optional_output_slot(int name, void OpTaskSignature::add_bwd_optional_output_slot(slot_id_t name, SlotType slot_type) { - 
OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::OUTPUT, - IsGrad::NO, - OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -117,12 +105,8 @@ void OpTaskSignature::add_optional_weight_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_weight_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::WEIGHT, - IsGrad::NO, - OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::WEIGHT, IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } diff --git a/lib/local-execution/src/op_task_to_task_invocation.cc b/lib/local-execution/src/op_task_to_task_invocation.cc new file mode 100644 index 0000000000..eb6dffabc4 --- /dev/null +++ b/lib/local-execution/src/op_task_to_task_invocation.cc @@ -0,0 +1,108 @@ +#include "local-execution/op_task_to_task_invocation.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +TaskInvocation lower_to_task_invocation( + OpTaskInvocation const &op_task_invocation, + layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, + std::optional const &device_states) { + TaskBinding binding; + // tensors + std::vector input_tensors = + get_incoming_inputs(computation_graph, layer_guid); + std::vector output_tensors = + get_outgoing_tensors(computation_graph, layer_guid); + std::vector weight_tensors = + get_incoming_weights(computation_graph, layer_guid); + + for (auto const &tensor_binding : + op_task_invocation.binding.get_tensor_bindings()) { + tensor_guid_t tensor_to_bind = [&] { + OpTensorSpec tensor_binding_spec = tensor_binding.second; + switch (tensor_binding_spec.role) { + case TensorRole::INPUT: + return input_tensors.at(tensor_binding_spec.idx); + case TensorRole::OUTPUT: + return output_tensors.at(tensor_binding_spec.idx); + case TensorRole::WEIGHT: + return weight_tensors.at(tensor_binding_spec.idx); + default: + throw mk_runtime_error( + fmt::format("Invalid tensor role {}", tensor_binding_spec.role)); + } + }(); + + SlotGradId slot_grad_id = tensor_binding.first; + + if (slot_grad_id.is_grad == IsGrad::NO) { + binding.bind(slot_grad_id.slot_id, tensor_to_bind); + } else if (slot_grad_id.is_grad == IsGrad::YES) { + binding.bind_grad(slot_grad_id.slot_id, tensor_to_bind); + } else { + throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", + tensor_binding.first.is_grad)); + } + } + + // args + for (auto const &arg_binding : + op_task_invocation.binding.get_arg_bindings()) { + if (arg_binding.second.has()) { + ConcreteArgSpec concrete_arg = + lower_to_concrete_arg_spec(arg_binding.second.get(), + computation_graph, + layer_guid, + device_states); + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); + } else if (arg_binding.second.has()) { + binding.insert_arg_spec( + arg_binding.first, + TaskArgSpec{arg_binding.second.get()}); + } else { + binding.insert_arg_spec( + arg_binding.first, + TaskArgSpec{arg_binding.second.get()}); + } + } + + return TaskInvocation{op_task_invocation.task_id, binding}; +} + +ConcreteArgSpec lower_to_concrete_arg_spec( + OpArgRefSpec const &op_arg_ref_spec, + ComputationGraph const &cg, + layer_guid_t const &op_guid, + 
std::optional const &device_states) { + if (op_arg_ref_spec.holds()) { + PerDeviceOpState device_state = + get_device_state_from_device_specific(device_states.value(), 0); + return ConcreteArgSpec::create(device_state); + } else if (op_arg_ref_spec.holds()) { + ParallelTensorShapeRefType index_op_arg_ref = + op_arg_ref_spec.get_ref_type().get(); + tensor_guid_t input_tensor = + get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); + TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); + ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); + return ConcreteArgSpec::create(shape); + } else { + throw mk_runtime_error("Unhandled op arg ref type"); + } +} + +ConcreteArgSpec + lower_to_concrete_arg_spec(RuntimeArgRefSpec const &runtime_arg_ref_spec, + RuntimeArgConfig const &runtime_arg_config) { + if (runtime_arg_ref_spec.holds>()) { + return ConcreteArgSpec::create(*(runtime_arg_config.ff_handle.get(0))); + } else if (runtime_arg_ref_spec.holds()) { + return ConcreteArgSpec::create(runtime_arg_config.profiling_settings); + } else { + throw mk_runtime_error("Unhandled runtime arg ref type"); + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/local-execution/src/optimizer_tensor_source.cc index 8adb8ec07b..c241c7f4bd 100644 --- a/lib/local-execution/src/optimizer_tensor_source.cc +++ b/lib/local-execution/src/optimizer_tensor_source.cc @@ -7,7 +7,8 @@ size_t OptimizerTensorSource::next_available_optimizer_tensor_id = 0; OptimizerTensorSource::OptimizerTensorSource() {} optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() { - return optimizer_tensor_t{OptimizerTensorSource::next_available_optimizer_tensor_id++}; + return optimizer_tensor_t{ + OptimizerTensorSource::next_available_optimizer_tensor_id++}; } -} +} // namespace FlexFlow diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 6fc8449f0b..f0aac85ea1 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,49 +1,45 @@ #include "local-execution/task_binding.h" +#include "pcg/tensor_guid_t.dtg.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" #include "utils/hash/unordered_map.h" -#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { -void TaskBinding::bind(int name, - tensor_guid_t const &binding) { +void TaskBinding::bind(int name, tensor_guid_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, - tensor_guid_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::FORWARD}, TensorTypeVariant{binding}}); +void TaskBinding::bind(slot_id_t name, tensor_guid_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::FORWARD}, + TensorTypeVariant{binding}}); } -void TaskBinding::bind_grad(int name, - tensor_guid_t const &binding) { +void TaskBinding::bind_grad(int name, tensor_guid_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind_grad(slot_id_t name, - tensor_guid_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT}, TensorTypeVariant{binding}}); +void TaskBinding::bind_grad(slot_id_t name, tensor_guid_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT}, + TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, - optimizer_tensor_t const &binding) { +void TaskBinding::bind(int name, 
optimizer_tensor_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, - optimizer_tensor_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER}, TensorTypeVariant{binding}}); +void TaskBinding::bind(slot_id_t name, optimizer_tensor_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER}, + TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, - loss_tensor_t const &binding) { +void TaskBinding::bind(int name, loss_tensor_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, - loss_tensor_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}}); +void TaskBinding::bind(slot_id_t name, loss_tensor_t const &binding) { + this->tensor_bindings.insert( + {SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}}); } void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) { @@ -91,12 +87,12 @@ std::ostream &operator<<(std::ostream &s, TaskBinding const &x) { namespace std { -size_t hash<::FlexFlow::TaskBinding>::operator() ( - ::FlexFlow::TaskBinding const &s) const { - size_t result = 0; - hash_combine(result, s.get_tensor_bindings()); - hash_combine(result, s.get_arg_bindings()); - return result; - } +size_t hash<::FlexFlow::TaskBinding>::operator()( + ::FlexFlow::TaskBinding const &s) const { + size_t result = 0; + hash_combine(result, s.get_tensor_bindings()); + hash_combine(result, s.get_arg_bindings()); + return result; +} } // namespace std diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index be1cf73e11..9b7b55633c 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,5 +1,6 @@ #include "local-execution/task_registry.h" #include "local-execution/task_signature_impl.h" +#include "pcg/computation_graph.h" namespace FlexFlow { @@ -43,8 +44,8 @@ void register_tasks_for_layer(TaskRegistry &task_registry, } bool registry_contains_task_for_layer(TaskRegistry const &task_registry, - layer_guid_t const &op, - OpTaskType const &op_task_type) { + layer_guid_t const &op, + OpTaskType const &op_task_type) { std::unordered_map> task_ids; switch (op_task_type) { case OpTaskType::INIT: @@ -63,4 +64,12 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, return task_ids.at(op).has_value(); } +void register_all_computation_graph_tasks(TaskRegistry ®istry, + ComputationGraph const &cg) { + for (layer_guid_t const &node : topological_ordering(cg)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + register_tasks_for_layer(registry, node, attrs); + } +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 88dfa34783..5f7c1ddb91 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,24 +1,24 @@ #include "kernels/attention_kernels.h" #include "local-execution/local_cost_estimator.h" #include "local-execution/local_cpu_allocator.h" -#include "local-execution/local_slots_backing.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/tensor_lowering.h" #include "op-attrs/ops/attention.h" #include 
"op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" -#include "utils/containers/get_only.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" #include "test/utils/doctest/fmt/variant.h" #include "test/utils/doctest/fmt/vector.h" #include "test_utils.h" +#include "utils/containers/get_only.h" #include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalSlotsBacking -- Attention Op") { + TEST_CASE("LocalTensorBacking -- Attention Op") { // allocate input memory Allocator allocator = create_local_cpu_memory_allocator(); int embed_dim = 32; @@ -69,10 +69,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_layer_by_name(cg_builder.computation_graph, layer_name); LayerTensorBackingMap layer_tensor_backing_map = { - {LayerTensorKey{layer_guid, lower(query_guid)}, query}, - {LayerTensorKey{layer_guid, lower(key_guid)}, key}, - {LayerTensorKey{layer_guid, lower(value_guid)}, value}, - //{LayerTensorKey{layer_guid, lower(output_guid), output}} + {LayerTensorKey{layer_guid, lower(query_guid)}, query}, + {LayerTensorKey{layer_guid, lower(key_guid)}, key}, + {LayerTensorKey{layer_guid, lower(value_guid)}, value}, + //{LayerTensorKey{layer_guid, lower(output_guid), output}} }; // runtime arg config @@ -84,13 +84,13 @@ TEST_SUITE(FF_TEST_SUITE) { EnableProfiling::NO, settings}; - LocalSlotsBacking local_slots_backing = {layer_tensor_backing_map, - TensorBackingMap{}, - runtime_arg_config}; + LocalTensorBacking local_tensor_backing = { + layer_tensor_backing_map, TensorBackingMap{}, runtime_arg_config}; - SUBCASE("LocalSlotsBacking::allocate_tensors_by_role") { + SUBCASE("LocalTensorBacking::allocate_tensors_by_role") { auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, layer_guid_t l, + [&](tensor_guid_t t, + layer_guid_t l, LayerTensorBackingMap m) -> std::pair { GenericTensorAccessorW accessor = m.at(LayerTensorKey{l, lower(t)}); return get_shape_and_datatype(accessor); @@ -99,7 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Input (QKV) and gradient tensors allocation") { // allocate all tensors from input nodes - local_slots_backing.allocate_tensors_by_role( + local_tensor_backing.allocate_tensors_by_role( TensorRole::INPUT, layer_guid, cg_builder.computation_graph, @@ -108,7 +108,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Query grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + query_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{query_shape}, dtype}; CHECK(result == correct); @@ -116,7 +118,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + key_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{key_shape}, dtype}; CHECK(result == correct); @@ -124,14 +128,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + value_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{value_shape}, dtype}; CHECK(result == correct); } } SUBCASE("Output and gradient tensors allocation") { - local_slots_backing.allocate_tensors_by_role( + 
local_tensor_backing.allocate_tensors_by_role( TensorRole::OUTPUT, layer_guid, cg_builder.computation_graph, @@ -139,7 +145,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, layer_guid, local_slots_backing.tensor_mapping); + output_guid, layer_guid, local_tensor_backing.tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -150,7 +156,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + output_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -161,31 +169,36 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Tensor slots") { - local_slots_backing.allocate_layer_tensors( + local_tensor_backing.allocate_layer_tensors( layer_guid, cg_builder.computation_graph, allocator); SUBCASE("Input tensor slots") { - std::vector correct_incoming_input_tensors = - transform(get_incoming_inputs(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); + std::vector correct_incoming_input_tensors = + transform( + get_incoming_inputs(cg_builder.computation_graph, layer_guid), + [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); CHECK(correct_incoming_input_tensors == - local_slots_backing.input_tensor_slots.at(layer_guid)); + local_tensor_backing.input_tensor_slots.at(layer_guid)); } SUBCASE("Weight tensor slots") { - std::vector correct_incoming_weight_tensors = - transform(get_incoming_weights(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); + std::vector correct_incoming_weight_tensors = + transform(get_incoming_weights(cg_builder.computation_graph, + layer_guid), + [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); CHECK(correct_incoming_weight_tensors == - local_slots_backing.weight_tensor_slots.at(layer_guid)); + local_tensor_backing.weight_tensor_slots.at(layer_guid)); } SUBCASE("Output tensor slots") { - std::vector correct_output_tensors = - transform(get_outgoing_tensors(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); + std::vector correct_output_tensors = transform( + get_outgoing_tensors(cg_builder.computation_graph, layer_guid), + [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); CHECK(correct_output_tensors == - local_slots_backing.output_tensor_slots.at(layer_guid)); + local_tensor_backing.output_tensor_slots.at(layer_guid)); } } } @@ -224,14 +237,14 @@ TEST_SUITE(FF_TEST_SUITE) { return b; }(); - local_slots_backing.allocate_layer_tensors( + local_tensor_backing.allocate_layer_tensors( layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("LocalSlotsBacking::construct_tensor_slots_backing") { + SUBCASE("LocalTensorBacking::construct_tensor_slots_backing") { TensorSlotsBackingWithoutAddresses result = get_slots_backing_without_tensor_allocation_addresses( - local_slots_backing.construct_tensor_slots_backing(binding, - layer_guid)); + local_tensor_backing.construct_tensor_slots_backing( + binding, layer_guid)); TensorSlotsBackingWithoutAddresses correct = [&] { TensorShape weights_shape = throw_if_unexpected( 
get_weights_shape(attrs, query_shape, key_shape, value_shape)); @@ -244,20 +257,25 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(output_attrs.shape); return get_slots_backing_without_tensor_allocation_addresses( TensorSlotsBacking{ - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, query}, + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, + query}, {SlotTensorTypeId{slot_id_t{KEY}, TensorType::FORWARD}, key}, - {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, value}, - {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, weights}, - {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, output}, - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, query}}); + {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, + value}, + {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, + weights}, + {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, + output}, + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, + query}}); }(); CHECK(result == correct); } - SUBCASE("LocalSlotsBacking::construct_arg_slots_backing") { + SUBCASE("LocalTensorBacking::construct_arg_slots_backing") { ArgSlotsBacking result = - local_slots_backing.construct_arg_slots_backing(binding, - layer_guid); + local_tensor_backing.construct_arg_slots_backing(binding, + layer_guid); ArgSlotsBacking correct = [&] { ParallelTensorShape query_parallel_tensor_shape = @@ -277,10 +295,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("LocalSlotsBacking::resolve_runtime_arg_ref_spec") { + SUBCASE("LocalTensorBacking::resolve_runtime_arg_ref_spec") { RuntimeArgRefSpec ref_spec = RuntimeArgRefSpec::create(ff_handle()); ConcreteArgSpec arg_spec = - local_slots_backing.resolve_runtime_arg_ref_spec(ref_spec); + local_tensor_backing.resolve_runtime_arg_ref_spec(ref_spec); PerDeviceFFHandle result_handle = arg_spec.get(); CHECK(result_handle == handle); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index bddda7acd1..979e4360d7 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorSlotsBacking tensor_slots_backing = { {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::FORWARD}, input}, {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::GRADIENT}, input_grad}, - {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, variadic_tensors}, + {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, + variadic_tensors}, {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::GRADIENT}, variadic_tensors_grad}, }; @@ -50,45 +51,46 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input)}; - GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); + GenericTensorAccessor result = acc.get_tensor( + slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); CHECK(correct == result); } SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input_grad)}; - GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT); + GenericTensorAccessor result = acc.get_tensor( 
+          slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input_grad};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::FORWARD)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::GRADIENT)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input_grad};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::GRADIENT);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::RW, TensorType::GRADIENT);
      CHECK(correct == result);
    }
  }
  SUBCASE("get_variadic_tensor") {
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, "
+            "TensorType::FORWARD)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{std::vector{
              read_only_accessor_from_write_accessor(variadic_tensors.at(0)),
@@ -98,7 +100,8 @@ TEST_SUITE(FF_TEST_SUITE) {
          slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::FORWARD);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, "
+            "TensorType::GRADIENT)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{std::vector{
              read_only_accessor_from_write_accessor(
@@ -109,28 +112,32 @@ TEST_SUITE(FF_TEST_SUITE) {
          slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::GRADIENT);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, "
+            "TensorType::FORWARD)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
          slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::FORWARD);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, "
+            "TensorType::GRADIENT)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors_grad};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
          slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::GRADIENT);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RW, "
+            "TensorType::FORWARD)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
          slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::FORWARD);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RW, "
+            "TensorType::GRADIENT)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors_grad};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 5793d02f31..210cd1af83 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -1,9 +1,9 @@
 #include "doctest/doctest.h"
-#include "local-execution/tensor_reduction.h"
 #include "kernels/local_cuda_allocator.h"
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "local-execution/tensor_lowering.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
@@ -36,7 +36,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
    std::string layer_name = "scalar multiply";
    tensor_guid_t logit_tensor =
        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
-    layer_guid_t layer_guid = get_layer_by_name(cg_builder.computation_graph, layer_name);
+    layer_guid_t layer_guid =
+        get_layer_by_name(cg_builder.computation_graph, layer_name);
    // allocate memory
    Allocator allocator = create_local_cuda_memory_allocator();
@@ -52,37 +53,42 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
    SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
      TensorShape label_shape = TensorShape{
          TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT};
-      reduced_tensor_t label_tensor = reduced_tensor_t{-1};
+      lowered_tensor_t label_tensor = lowered_tensor_t{-1};
      GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(label_shape);
-      local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing});
+      local_backing.local_tensor_backing.non_graph_tensor_mapping.insert(
+          {label_tensor, label_backing});
      LossAttrs loss_attrs = LossAttrs{
          SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}};
      local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor);
    }
    SUBCASE("NonconfigurableLossAttrs") {
-      reduced_tensor_t label_tensor = reduced_tensor_t{-1};
+      lowered_tensor_t label_tensor = lowered_tensor_t{-1};
      GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(input_shape);
-      local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing});
+      local_backing.local_tensor_backing.non_graph_tensor_mapping.insert(
+          {label_tensor, label_backing});
      SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
        LossAttrs loss_attrs = LossAttrs{
            NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
-        local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor);
+        local_backing.compute_loss(
+            loss_attrs, lower(logit_tensor), label_tensor);
      }
      SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
        LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{
            LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}};
-        local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor);
+        local_backing.compute_loss(
loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("LossFunction::IDENTITY") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); + local_backing.compute_loss( + loss_attrs, lower(logit_tensor), label_tensor); } } } diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index 2e5e386a95..d16c5e5b0b 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -3,7 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/tensor_lowering.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 483e735196..d94f7af4fb 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -9,6 +9,7 @@ #include "op-attrs/ops/combine.h" #include "op-attrs/ops/concat.h" #include "op-attrs/ops/conv_2d.h" +#include "op-attrs/ops/core.h" #include "op-attrs/ops/dropout.h" #include "op-attrs/ops/element_binary.h" #include "op-attrs/ops/element_unary.h" @@ -30,7 +31,6 @@ #include "op-attrs/ops/split.h" #include "op-attrs/ops/topk.h" #include "op-attrs/ops/transpose.h" -#include "op-attrs/ops/core.h" #include "op-attrs/pcg_operator_attrs.dtg.h" #include "utils/record_formatter.h" #include "utils/variant.h" diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index f70d9f7404..e3a8cc662c 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -37,6 +37,8 @@ std::vector get_incoming_inputs(ComputationGraph const &, std::vector get_incoming_weights(ComputationGraph const &, layer_guid_t const &); +std::unordered_set get_all_tensors(ComputationGraph const &); + std::unordered_set get_subgraph_incoming_edges(ComputationGraph const &, std::unordered_set const &); diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index 1d74694c29..f18722d1bb 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -7,6 +7,7 @@ namespace FlexFlow { OptimizerAttrs get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old); +int get_num_optimizer_tensors(OptimizerAttrs const &); } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index a69e54fd93..d358dc5031 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -121,6 +121,11 @@ std::vector get_incoming_weights(ComputationGraph const &cg, return get_incoming_tensors_with_role(cg, l, IncomingTensorRole::WEIGHT); } +std::unordered_set get_all_tensors(ComputationGraph const &cg) { + return transform(get_all_dataflow_outputs(cg.raw_graph), + [](DataflowOutput const &t) { return tensor_guid_t(t); }); +} + std::unordered_set get_subgraph_incoming_edges( ComputationGraph const &cg, std::unordered_set const &subgraph_nodes) { diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index ce2d3d0db7..7a37091428 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ 
b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -1,4 +1,5 @@ #include "pcg/optimizer_attrs.h" +#include "utils/overload.h" namespace FlexFlow { @@ -22,4 +23,16 @@ OptimizerAttrs } } +int get_num_optimizer_tensors(OptimizerAttrs const &attrs) { + return attrs.visit( + overload{[&](SGDOptimizerAttrs const &o) { + if (o.momentum > 0.0f) { + return 1; + } else { + return 0; + } + }, + [&](AdamOptimizerAttrs const &) { return 2; }}); +} + } // namespace FlexFlow From 277f8c268632dfcc5622d96f55b65751d063d736 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Fri, 31 Jan 2025 19:36:45 -0800 Subject: [PATCH 33/91] Update task binding interface and cost estimator --- .../local-execution/local_training_backing.h | 3 +- .../include/local-execution/loss_functions.h | 8 +- .../local-execution/model_training_instance.h | 4 +- .../op_task_to_task_invocation.h | 11 +- .../include/local-execution/optimizer.h | 3 + .../local-execution/task_argument_accessor.h | 45 ++++---- .../include/local-execution/task_binding.h | 12 +- .../include/local-execution/task_registry.h | 1 + .../src/local_cost_estimator.cc | 103 ++++++++++++------ .../src/local_training_backing.cc | 59 ++++++---- lib/local-execution/src/loss_functions.cc | 18 +-- .../src/model_training_instance.cc | 18 +-- .../src/op_task_to_task_invocation.cc | 5 +- lib/local-execution/src/optimizer.cc | 47 +++++--- lib/local-execution/src/task_binding.cc | 19 ++-- .../include/pcg/computation_graph_builder.h | 6 - .../parallel_computation_graph_builder.h | 4 - lib/pcg/src/pcg/computation_graph_builder.cc | 14 +-- .../parallel_computation_graph_builder.cc | 2 +- 19 files changed, 220 insertions(+), 162 deletions(-) diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index b712be9a93..ef5e7ec41e 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -25,7 +25,6 @@ struct LocalTrainingBacking { ComputationGraph computation_graph; TaskRegistry task_registry; -private: GradientTensorSource gradient_tensor_source; }; @@ -42,7 +41,7 @@ std::optional execute_forward(LocalTrainingBacking &, layer_guid_t const &); std::optional execute_backward(LocalTrainingBacking &, layer_guid_t const &); -void compute_loss(LocalTrainingBacking const &, +void compute_loss(LocalTrainingBacking &, LossAttrs const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor); diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index f56f2b05b1..b2a6d610c3 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,19 +16,21 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ +#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/task_signature.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); -TaskInvocation - backward(LossAttrs const &, tensor_guid_t logit, loss_tensor_t label); +TaskInvocation backward(LossAttrs 
const &, + tensor_guid_t logit, + gradient_tensor_t logit_grad, + loss_tensor_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 81aacf2a53..bf0fc1a3c0 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -13,8 +13,8 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(LocalTrainingBacking const &, - tensor_guid_t const & logit_tensor, - TensorShape const & label_tensor_shape, + tensor_guid_t const &logit_tensor, + TensorShape const &label_tensor_shape, LossAttrs const &, OptimizerAttrs const &); diff --git a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h index 44e10d4b51..02b3c938b0 100644 --- a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h @@ -10,11 +10,12 @@ namespace FlexFlow { -TaskInvocation - lower_to_task_invocation(OpTaskInvocation const &, - layer_guid_t const &, - ComputationGraph const &, - std::optional const &); +TaskInvocation lower_to_task_invocation( + OpTaskInvocation const &, + layer_guid_t const &, + ComputationGraph const &, + std::unordered_map const &, + std::optional const &); ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, RuntimeArgConfig const &); diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index f0dd610a1f..3a092e34c6 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -14,18 +14,21 @@ TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &adam_v, optimizer_tensor_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 16a63a789b..0cbeaf04c8 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -50,7 +50,7 @@ struct TaskArgumentAccessor { template privilege_mode_to_accessor get_optimizer_tensor(int slot) const { - return this->get_tensor_grad(slot_id_t{slot}); + return this->get_optimizer_tensor(slot_id_t{slot}); } template @@ -59,17 +59,16 @@ struct TaskArgumentAccessor { this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - // template - // privilege_mode_to_accessor get_non_graph_tensor(int slot) const { - // return this->get_tensor_grad(slot_id_t{slot}); 
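As a point of reference for the accessor split above: a task body now picks the getter matching the TensorType its slot was bound with. A minimal consumer sketch (the slot names and the SGD framing are illustrative, not part of this patch):

    // Sketch only: assumes WEIGHT / WEIGHT_GRAD / SGD_V were bound via bind,
    // bind_grad, and bind_optimizer respectively (see optimizer.cc later in
    // this patch).
    static void example_sgd_task_body(TaskArgumentAccessor const &acc) {
      auto weight = acc.get_tensor<Permissions::RW>(WEIGHT);
      auto weight_grad = acc.get_tensor_grad<Permissions::RO>(WEIGHT_GRAD);
      auto sgd_v = acc.get_optimizer_tensor<Permissions::RW>(SGD_V);
      // each getter std::get's the variant member for its TensorType, so a
      // mismatched bind/get pair fails loudly instead of aliasing a tensor
    }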
- // } + template + privilege_mode_to_accessor get_loss_tensor(int slot) const { + return this->get_loss_tensor(slot_id_t{slot}); + } - // template - // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const - // { - // return std::get>( - // this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); - // } + template + privilege_mode_to_accessor get_loss_tensor(slot_id_t slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, TensorType::LOSS)); + } // variadic tensors template @@ -101,7 +100,7 @@ struct TaskArgumentAccessor { template std::vector> get_variadic_optimizer_tensor(int slot) const { - return this->get_variadic_tensor_grad(slot_id_t{slot}); + return this->get_variadic_optimizer_tensor(slot_id_t{slot}); } template @@ -111,18 +110,18 @@ struct TaskArgumentAccessor { this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - // template - // std::vector> - // get_variadic_non_graph_tensor(int slot) const { - // return this->get_variadic_tensor_grad(slot_id_t{slot}); - // } + template + std::vector> + get_variadic_loss_tensor(int slot) const { + return this->get_variadic_loss_tensor(slot_id_t{slot}); + } - // template - // std::vector> - // get_variadic_non_graph_tensor(slot_id_t slot) const { - // return std::get>>( - // this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH)); - // } + template + std::vector> + get_variadic_loss_tensor(slot_id_t slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, TensorType::LOSS)); + } Allocator get_allocator() const { return this->ptr->get_allocator(); diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index 21fc813a6b..aba0c01a65 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -19,14 +19,14 @@ struct TaskBinding { void bind(int, tensor_guid_t const &); void bind(slot_id_t, tensor_guid_t const &); - void bind_grad(int, tensor_guid_t const &); - void bind_grad(slot_id_t, tensor_guid_t const &); + void bind_grad(int, gradient_tensor_t const &); + void bind_grad(slot_id_t, gradient_tensor_t const &); - void bind(int, optimizer_tensor_t const &); - void bind(slot_id_t, optimizer_tensor_t const &); + void bind_optimizer(int, optimizer_tensor_t const &); + void bind_optimizer(slot_id_t, optimizer_tensor_t const &); - void bind(int, loss_tensor_t const &); - void bind(slot_id_t, loss_tensor_t const &); + void bind_loss(int, loss_tensor_t const &); + void bind_loss(slot_id_t, loss_tensor_t const &); template void bind_arg(int name, T const &t) { diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index 1669822c83..cb717ca2af 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -5,6 +5,7 @@ #include "local-execution/op_task_type.dtg.h" #include "local-execution/task_registry.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" +#include "pcg/computation_graph.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index b959f31a8b..24175a5ee8 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -8,6 +8,7 @@ #include "pcg/computation_graph/layer_added_result.dtg.h" #include 
"pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" +#include "utils/containers/concat_vectors.h" #include "utils/containers/sum.h" #include "utils/containers/transform.h" #include "utils/containers/values.h" @@ -17,6 +18,53 @@ namespace FlexFlow { LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) : runtime_arg_config(config) {} +static ComputationGraph const & + create_computation_graph_for_local_cost_estimation( + PCGOperatorAttrs const &op, + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs) { + ComputationGraph computation_graph = make_empty_computation_graph(); + + // create layer for inputs + auto get_vector_piece_attrs_from_parallel_tensor_shape = + [](std::vector const ¶llel_shapes) { + return transform(parallel_shapes, [](ParallelTensorShape const &p) { + return TensorAttrs{ + get_piece_shape(p), std::nullopt, std::nullopt, CreateGrad::YES}; + }); + }; + + LayerAddedResult inputs_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, + {}, + get_vector_piece_attrs_from_parallel_tensor_shape(inputs)); + + // create layer for weights + auto get_vector_piece_attrs_from_parallel_tensor_attrs = + [](std::vector const ¶llel_attrs) { + return transform(parallel_attrs, [](ParallelTensorAttrs const &p) { + return get_piece_attrs(p); + }); + }; + + LayerAddedResult weights_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "weights"}, + {}, + get_vector_piece_attrs_from_parallel_tensor_attrs(weights)); + + // create operator layer + LayerAddedResult operator_layer = add_layer( + computation_graph, + LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, + concat_vectors(inputs_layer.outputs, weights_layer.outputs), + get_vector_piece_attrs_from_parallel_tensor_attrs(outputs)); + + return computation_graph; +} + CostDetails LocalCostEstimator::estimate_cost( PCGOperatorAttrs const &op, std::vector const &inputs, @@ -29,47 +77,34 @@ CostDetails LocalCostEstimator::estimate_cost( return CostDetails{0, 0}; } - LayerAttrs layer_attrs = - LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), std::nullopt}; + // construct computation graph + ComputationGraph computation_graph = + create_computation_graph_for_local_cost_estimation( + op, inputs, weights, outputs); - // allocate memory for inputs + // allocate memory std::shared_ptr tracked_allocator_ptr = std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); - std::vector input_tensor_ids; - - ComputationGraphBuilder cg_builder; - for (ParallelTensorShape const &input : inputs) { - TensorShape tensor_shape = get_piece_shape(input); - tensor_guid_t tensor_id = - cg_builder.create_input(tensor_shape, CreateGrad::YES); - input_tensor_ids.push_back(tensor_id); - } - auto get_vector_piece_attrs = - [](std::vector const ¶llel_attrs) { - return transform(parallel_attrs, [](ParallelTensorAttrs const &p) { - return get_piece_attrs(p); - }); - }; + LocalTrainingBacking local_backing( + allocator, + computation_graph, + LocalTensorBacking{}, + LocalArgsBacking{this->runtime_arg_config}); - // add operator to graph - LayerAddedResult layer_added_result = - cg_builder.add_layer_and_get_layer_added_result( - layer_attrs, - input_tensor_ids, - transform(get_vector_piece_attrs(weights), - [&](TensorAttrs const &a) { - return cg_builder.create_weight(a); - }), - get_vector_piece_attrs(outputs)); + 
allocate_all_computation_graph_tensors(local_backing.local_tensor_backing, + local_backing.gradient_tensor_source, + local_backing.computation_graph, + local_backing.allocator); + + // execute layer + layer_guid_t operator_layer_guid = + get_layer_by_name(computation_graph, "operator"); + execute_init(local_backing, operator_layer_guid); + float fwd = execute_forward(local_backing, operator_layer_guid).value(); + float bwd = execute_backward(local_backing, operator_layer_guid).value(); - LocalTrainingBacking local_backing( - allocator, cg_builder.computation_graph, this->runtime_arg_config); - local_backing.register_and_allocate_layer(layer_added_result.layer); - local_backing.execute_init(layer_added_result.layer); - float fwd = local_backing.execute_forward(layer_added_result.layer).value(); - float bwd = local_backing.execute_backward(layer_added_result.layer).value(); float total_execution_time = fwd + bwd; return CostDetails{total_execution_time, diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 4893d9be88..144596820a 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -60,15 +60,17 @@ void execute_init(LocalTrainingBacking &local_training_backing, get_layer_attrs(local_training_backing.computation_graph, operator_node) .attrs; - TaskInvocation invocation = - lower_to_task_invocation(init(attrs), - operator_node, - local_training_backing.computation_graph, - std::nullopt); + TaskInvocation invocation = lower_to_task_invocation( + init(attrs), + operator_node, + local_training_backing.computation_graph, + local_training_backing.local_tensor_backing.tensor_gradient_mapping, + std::nullopt); TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, - invocation); + invocation, + local_training_backing.allocator); DeviceSpecificDeviceStates device_state = call_init_task_impl( local_training_backing.task_registry, invocation.task_id, accessor); add_per_device_op_state( @@ -89,11 +91,12 @@ std::optional std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - TaskInvocation invocation = - lower_to_task_invocation(forward(attrs), - operator_node, - local_training_backing.computation_graph, - device_state); + TaskInvocation invocation = lower_to_task_invocation( + forward(attrs), + operator_node, + local_training_backing.computation_graph, + local_training_backing.local_tensor_backing.tensor_gradient_mapping, + device_state); TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, @@ -106,18 +109,23 @@ std::optional } } -void compute_loss(LocalTrainingBacking const &local_training_backing, +void compute_loss(LocalTrainingBacking &local_training_backing, LossAttrs const &loss_attrs, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor) { - TaskInvocation loss_invocation = - backward(loss_attrs, logit_tensor, label_tensor); + TaskInvocation loss_invocation = backward( + loss_attrs, + logit_tensor, + local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( + logit_tensor), + label_tensor); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = 
get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, - loss_invocation); + loss_invocation, + local_training_backing.allocator); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get().function_ptr(loss_accessor); } @@ -135,11 +143,12 @@ std::optional std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - TaskInvocation invocation = - lower_to_task_invocation(backward(attrs), - operator_node, - local_training_backing.computation_graph, - device_state); + TaskInvocation invocation = lower_to_task_invocation( + backward(attrs), + operator_node, + local_training_backing.computation_graph, + local_training_backing.local_tensor_backing.tensor_gradient_mapping, + device_state); TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, @@ -161,13 +170,19 @@ void execute_update(LocalTrainingBacking &local_training_backing, // get tensors tensor_guid_t weight_tensor = get_only( get_outgoing_tensors(local_training_backing.computation_graph, node)); + + gradient_tensor_t weight_grad_tensor = + local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( + weight_tensor); std::vector optimizer_buffer_tensors = local_training_backing.local_tensor_backing.tensor_optimizer_mapping.at( weight_tensor); // get invocation - TaskInvocation invocation = get_update_invocation( - optimizer_attrs, weight_tensor, optimizer_buffer_tensors); + TaskInvocation invocation = get_update_invocation(optimizer_attrs, + weight_tensor, + weight_grad_tensor, + optimizer_buffer_tensors); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_update_signature(attrs), invocation)); diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 0a89dfd9d5..93a792c466 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -20,25 +20,27 @@ namespace FlexFlow { -enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; +enum Slots { LOGIT, LABEL, LOGIT_GRAD, ATTRS, PROFILING }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); add_slot(sig, LOGIT, TensorType::FORWARD); add_slot(sig, LABEL, TensorType::LOSS); - add_slot(sig, LOGIT, TensorType::GRADIENT); + add_slot(sig, LOGIT_GRAD, TensorType::GRADIENT); add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); return sig; } -TaskInvocation - backward(LossAttrs const &attrs, tensor_guid_t logit, loss_tensor_t label) { +TaskInvocation backward(LossAttrs const &attrs, + tensor_guid_t logit, + gradient_tensor_t logit_grad, + loss_tensor_t label) { TaskBinding b; b.bind(LOGIT, logit); - b.bind(LABEL, label); - b.bind_grad(LOGIT, logit); + b.bind_loss(LABEL, label); + b.bind_grad(LOGIT_GRAD, logit_grad); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -49,9 +51,9 @@ TaskInvocation static void backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto profiling = acc.get_argument(PROFILING); - auto logit_grad = acc.get_tensor_grad(LOGIT); + auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); - auto label = acc.get_tensor(LABEL); + auto label = acc.get_loss_tensor(LABEL); int batch_size = logit.shape.at(legion_dim_t{1}); // assuming logit shape is [batch dim, num classes] diff --git 
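One consequence of the LOGIT/LOGIT_GRAD split above: a slot id no longer does double duty for forward and gradient bindings. A rough sketch of the resulting binding shape, mirroring backward() above with illustrative values:

    TaskBinding b;
    b.bind(LOGIT, logit);                 // keyed {LOGIT, TensorType::FORWARD}
    b.bind_grad(LOGIT_GRAD, logit_grad);  // keyed {LOGIT_GRAD, TensorType::GRADIENT}
    b.bind_loss(LABEL, label);            // keyed {LABEL, TensorType::LOSS}
    // one binding per add_slot entry declared in get_loss_bwd_signature()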
a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 6691bd3a03..98b8851212 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -7,7 +7,7 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( LocalTrainingBacking const &local_training_backing, - tensor_guid_t const & logit_tensor, + tensor_guid_t const &logit_tensor, TensorShape const &label_tensor_shape, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) @@ -45,7 +45,7 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { compute_loss(model_training_instance.training_backing, - model_training_instance.loss_attrs, + model_training_instance.loss_attrs, model_training_instance.logit_tensor, model_training_instance.label_tensor); @@ -59,13 +59,15 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -void update(ModelTrainingInstance & model_training_instance) { - for (layer_guid_t const &node : - topological_ordering(model_training_instance.training_backing.computation_graph)) { - execute_update(model_training_instance.training_backing, node, model_training_instance.optimizer_attrs); +void update(ModelTrainingInstance &model_training_instance) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { + execute_update(model_training_instance.training_backing, + node, + model_training_instance.optimizer_attrs); } - model_training_instance.optimizer_attrs = - get_optimizer_attrs_for_next_iter(model_training_instance.optimizer_attrs); + model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter( + model_training_instance.optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_to_task_invocation.cc b/lib/local-execution/src/op_task_to_task_invocation.cc index eb6dffabc4..0e04a2adec 100644 --- a/lib/local-execution/src/op_task_to_task_invocation.cc +++ b/lib/local-execution/src/op_task_to_task_invocation.cc @@ -8,6 +8,8 @@ TaskInvocation lower_to_task_invocation( OpTaskInvocation const &op_task_invocation, layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, + std::unordered_map const + &tensor_gradient_mapping, std::optional const &device_states) { TaskBinding binding; // tensors @@ -40,7 +42,8 @@ TaskInvocation lower_to_task_invocation( if (slot_grad_id.is_grad == IsGrad::NO) { binding.bind(slot_grad_id.slot_id, tensor_to_bind); } else if (slot_grad_id.is_grad == IsGrad::YES) { - binding.bind_grad(slot_grad_id.slot_id, tensor_to_bind); + binding.bind_grad(slot_grad_id.slot_id, + tensor_gradient_mapping.at(tensor_to_bind)); } else { throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad)); diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 94584dfc95..0c64147bd8 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -5,12 +5,21 @@ namespace FlexFlow { -enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE }; +enum Slots { + ATTRS, + WEIGHT, + WEIGHT_GRAD, + SGD_V, + PROFILING, + ADAM_M, + ADAM_V, + HANDLE +}; TaskSignature get_sgd_update_signature() { TaskSignature sig = make_empty_task_signature(); add_slot(sig, WEIGHT, TensorType::FORWARD); - 
add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, WEIGHT_GRAD, TensorType::GRADIENT); add_slot(sig, SGD_V, TensorType::OPTIMIZER); add_arg_slot(sig, ATTRS); @@ -23,13 +32,14 @@ TaskSignature get_sgd_update_signature() { TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &sgd_v) { TaskBinding b; b.bind(WEIGHT, weight); - b.bind_grad(WEIGHT, weight); + b.bind_grad(WEIGHT_GRAD, weight_grad); if (attrs.momentum > 0.0f) { - b.bind(SGD_V, sgd_v); + b.bind_optimizer(SGD_V, sgd_v); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -44,7 +54,7 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); - auto weight_grad = acc.get_tensor_grad(WEIGHT); + auto weight_grad = acc.get_tensor_grad(WEIGHT_GRAD); auto weight = acc.get_tensor(WEIGHT); auto profiling = acc.get_argument(PROFILING); @@ -57,7 +67,7 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { float *sgd_v_ptr; if (attrs.momentum > 0.0f) { - auto sgd_v = acc.get_tensor(SGD_V); + auto sgd_v = acc.get_optimizer_tensor(SGD_V); assert(sgd_v.shape == weight.shape); sgd_v_ptr = sgd_v.get_float_ptr(); } @@ -100,7 +110,7 @@ TaskImplFunction get_sgd_update_task_impl() { TaskSignature get_adam_update_signature() { TaskSignature sig = make_empty_task_signature(); add_slot(sig, WEIGHT, TensorType::FORWARD); - add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, WEIGHT_GRAD, TensorType::GRADIENT); add_slot(sig, ADAM_V, TensorType::OPTIMIZER); add_slot(sig, ADAM_M, TensorType::OPTIMIZER); @@ -114,13 +124,14 @@ TaskSignature get_adam_update_signature() { TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &adam_v, optimizer_tensor_t const &adam_m) { TaskBinding b; b.bind(WEIGHT, weight); - b.bind_grad(WEIGHT, weight); - b.bind(ADAM_M, adam_m); - b.bind(ADAM_V, adam_v); + b.bind_grad(WEIGHT_GRAD, weight_grad); + b.bind_optimizer(ADAM_M, adam_m); + b.bind_optimizer(ADAM_V, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -134,10 +145,10 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); - auto weight_grad = acc.get_tensor_grad(WEIGHT); + auto weight_grad = acc.get_tensor_grad(WEIGHT_GRAD); auto weight = acc.get_tensor(WEIGHT); - auto v_tensor = acc.get_tensor(ADAM_V); - auto m_tensor = acc.get_tensor(ADAM_M); + auto v_tensor = acc.get_optimizer_tensor(ADAM_V); + auto m_tensor = acc.get_optimizer_tensor(ADAM_M); auto profiling = acc.get_argument(PROFILING); @@ -195,14 +206,18 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, std::vector const &grad_buffer_tensors) { return attrs.visit(overload{ [&](SGDOptimizerAttrs const &s) { - return sgd_update(s, weight, grad_buffer_tensors.at(0)); + return sgd_update(s, weight, weight_grad, grad_buffer_tensors.at(0)); }, [&](AdamOptimizerAttrs const &s) { - return adam_update( - s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1)); + return adam_update(s, + weight, + weight_grad, + grad_buffer_tensors.at(0), + grad_buffer_tensors.at(1)); 
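The at(0)/at(1) indexing above assumes the caller sized the buffer list with get_num_optimizer_tensors (added to pcg earlier in this series). A sketch of the expected caller side; the tensor-source method name is hypothetical:

    // SGD: one auxiliary buffer (momentum); Adam: two (m and v).
    int num_bufs = get_num_optimizer_tensors(optimizer_attrs);
    std::vector<optimizer_tensor_t> bufs;
    for (int i = 0; i < num_bufs; i++) {
      bufs.push_back(optimizer_tensor_source.new_optimizer_tensor()); // hypothetical name
    }
    TaskInvocation inv =
        get_update_invocation(optimizer_attrs, weight, weight_grad, bufs);
    // caveat: the SGD branch reads grad_buffer_tensors.at(0) unconditionally,
    // so momentum-free SGD (zero buffers) would throw std::out_of_range here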
}}); } diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index f0aac85ea1..7684511488 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -15,29 +15,30 @@ void TaskBinding::bind(slot_id_t name, tensor_guid_t const &binding) { TensorTypeVariant{binding}}); } -void TaskBinding::bind_grad(int name, tensor_guid_t const &binding) { - this->bind(slot_id_t{name}, binding); +void TaskBinding::bind_grad(int name, gradient_tensor_t const &binding) { + this->bind_grad(slot_id_t{name}, binding); } -void TaskBinding::bind_grad(slot_id_t name, tensor_guid_t const &binding) { +void TaskBinding::bind_grad(slot_id_t name, gradient_tensor_t const &binding) { this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT}, TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, optimizer_tensor_t const &binding) { - this->bind(slot_id_t{name}, binding); +void TaskBinding::bind_optimizer(int name, optimizer_tensor_t const &binding) { + this->bind_optimizer(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, optimizer_tensor_t const &binding) { +void TaskBinding::bind_optimizer(slot_id_t name, + optimizer_tensor_t const &binding) { this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER}, TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, loss_tensor_t const &binding) { - this->bind(slot_id_t{name}, binding); +void TaskBinding::bind_loss(int name, loss_tensor_t const &binding) { + this->bind_loss(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, loss_tensor_t const &binding) { +void TaskBinding::bind_loss(slot_id_t name, loss_tensor_t const &binding) { this->tensor_bindings.insert( {SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}}); } diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 585399ea1d..41c4ff5b5c 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -257,12 +257,6 @@ struct ComputationGraphBuilder { std::vector get_outputs(LayerAttrs const &) const; tensor_guid_t get_output(LayerAttrs const &, int idx) const; - LayerAddedResult add_layer_and_get_layer_added_result( - LayerAttrs const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs); - std::vector add_layer(LayerAttrs const &layer, std::vector const &inputs, diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 35113553f2..019b120936 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -179,10 +179,6 @@ struct ParallelComputationGraphBuilder { ParallelComputationGraph pcg; }; -ParallelTensorAttrs - make_weight_attrs(ParallelTensorShape const &shape, - std::optional const &initializer_attrs); - } // namespace FlexFlow #endif diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 4c619288cb..4a565476bd 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -106,7 +106,7 @@ static void check_incoming_tensor_roles(LayerAttrs const &layer, } } -LayerAddedResult 
ComputationGraphBuilder::add_layer_and_get_layer_added_result( +std::vector ComputationGraphBuilder::add_layer( LayerAttrs const &layer, std::vector const &inputs, std::vector const &weights, @@ -115,17 +115,7 @@ LayerAddedResult ComputationGraphBuilder::add_layer_and_get_layer_added_result( LayerAddedResult added = ::FlexFlow::add_layer( this->computation_graph, layer, concat_vectors(inputs, weights), outputs); - return added; -} - -std::vector ComputationGraphBuilder::add_layer( - LayerAttrs const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { - return this - ->add_layer_and_get_layer_added_result(layer, inputs, weights, outputs) - .outputs; + return added.outputs; } tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x, diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index b56156fe8a..ce00ea62f4 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -20,7 +20,7 @@ static std::string get_default_name(PCGOperatorAttrs const &attrs) { return get_default_name(get_op_type(attrs)); } -ParallelTensorAttrs make_weight_attrs( +static ParallelTensorAttrs make_weight_attrs( ParallelTensorShape const &shape, std::optional const &initializer_attrs) { return ParallelTensorAttrs{ From 6f689a472be91eed310c48217004754f315aab94 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 4 Feb 2025 17:46:01 -0800 Subject: [PATCH 34/91] feat: add Future wrapper for func result --- .../include/realm-backend/task_result.h | 103 ++++++++++++++++++ .../include/realm-backend/task_wrapper.h | 6 +- lib/realm-backend/src/task_result.cc | 35 ++++++ lib/realm-backend/src/task_wrapper.cc | 28 ++--- 4 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/task_result.h create mode 100644 lib/realm-backend/src/task_result.cc diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h new file mode 100644 index 0000000000..5fb158496e --- /dev/null +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -0,0 +1,103 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H + +#include "realm-backend/driver.h" +#include + +namespace FlexFlow { + +/** + * @brief SharedState class template that holds the state for both the Promise + * and Future objects. It is responsible for storing the result value and + * synchronization between the producer (Promise) and consumer (Future). + */ +template struct SharedState { + // synchronization primitives + Realm::Event event = Realm::Event::NO_EVENT; + // where the result is stored + Realm::RegionInstance inst; + + SharedState() = delete; + SharedState(Realm::Memory); + void set_event(Realm::Event); + void set_value(T &&); + void wait(); + T get_value(); +}; + +// Specialization of SharedState for the `void` type, as it does not carry a +// value. +template <> struct SharedState { + // synchronization primitives + Realm::Event event = Realm::Event::NO_EVENT; + + SharedState() = default; + void set_event(Realm::Event); + void wait(); +}; + +/** + * @brief Future class template that allows retrieving the result from a + * SharedState object. 
It is used to access the value once the Promise has been
+ * fulfilled, and provides mechanisms to block the current thread until the
+ * result is available.
+ */
+template <typename T> class Future {
+public:
+  explicit Future(std::shared_ptr<SharedState<T>> state)
+      : state_(std::move(state)) {}
+  explicit Future(T value) : value_(std::move(value)) {}
+  void set_event(Realm::Event e) { state_->set_event(e); }
+  T get() {
+    value_ = state_->get_value();
+    return value_;
+  }
+  void wait() { state_->wait(); }
+
+private:
+  std::shared_ptr<SharedState<T>> state_;
+  T value_;
+};
+
+// Specialization of Future for the `void` type, as it does not carry a value.
+template <> class Future<void> {
+public:
+  explicit Future(std::shared_ptr<SharedState<void>> state)
+      : state_(std::move(state)) {}
+  explicit Future() = default;
+  void set_event(Realm::Event e) { state_->set_event(e); }
+  void wait() { state_->wait(); }
+
+private:
+  std::shared_ptr<SharedState<void>> state_;
+};
+
+/**
+ * @brief Promise class template that allows setting a result in a SharedState
+ * object. It is used to fulfill a Future with a value, and provides methods to
+ * notify the waiting Future of completion.
+ */
+template <typename T> class Promise {
+public:
+  Promise() = delete;
+  Promise(Realm::Memory mem) : state_(std::make_shared<SharedState<T>>(mem)) {}
+  Future<T> get_future() { return Future<T>(state_); }
+  void set_value(T &&value) const { state_->set_value(std::move(value)); }
+
+private:
+  std::shared_ptr<SharedState<T>> state_;
+};
+
+// Specialization of Promise for the `void` type, as it does not carry a value.
+template <> class Promise<void> {
+public:
+  Promise() : state_(std::make_shared<SharedState<void>>()) {}
+  Future<void> get_future() { return Future<void>(state_); }
+
+private:
+  std::shared_ptr<SharedState<void>> state_;
+};
+
+} // namespace FlexFlow
+
+#endif
\ No newline at end of file
diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h
index bf53ca7e93..89521becf4 100644
--- a/lib/realm-backend/include/realm-backend/task_wrapper.h
+++ b/lib/realm-backend/include/realm-backend/task_wrapper.h
@@ -2,18 +2,18 @@
 #define _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H
 #include "local-execution/task_registry.h"
-#include "realm-backend/driver.h"
 #include "realm-backend/realm_task_argument_accessor.h"
+#include "realm-backend/task_result.h"
 namespace FlexFlow {
 /* The following are general task wrappers to be invoked by the Realm runtime */
-struct RealmTaskArgs {
+template <typename T> struct RealmTaskArgs {
   task_id_t task_id;
   TaskImplFunction impl_function;
   TaskArgumentAccessor accessor;
-  void *result;
+  Promise<T> promise;
 };
 void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
                        size_t userlen, Processor p);
diff --git a/lib/realm-backend/src/task_result.cc b/lib/realm-backend/src/task_result.cc
new file mode 100644
index 0000000000..05aa1a8a9c
--- /dev/null
+++ b/lib/realm-backend/src/task_result.cc
@@ -0,0 +1,35 @@
+#include "realm-backend/task_result.h"
+
+namespace FlexFlow {
+
+/************ SharedState implementation ************/
+template <typename T> SharedState<T>::SharedState(Realm::Memory mem) {
+  Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0));
+  this->inst = Realm::RegionInstance::NO_INST;
+  Realm::RegionInstance::create_instance(
+      this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1,
+      Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT)
+      .wait();
+}
+
+template <typename T> void SharedState<T>::set_event(Realm::Event e) {
+  this->event = e;
+}
+
+template <typename T> void SharedState<T>::set_value(T &&value) {
+  Realm::GenericAccessor<T, 1> acc(this->inst, 0);
+  acc[Realm::Point<1>(0)] = std::move(value);
+}
+
+template <typename T> void SharedState<T>::wait() {
this->event.wait(); } + +template T SharedState::get_value() { + wait(); + Realm::GenericAccessor acc(this->inst, 0); + return acc[Realm::Point<1>(0)]; +} + +void SharedState::set_event(Realm::Event e) { this->event = e; } + +void SharedState::wait() { this->event.wait(); } +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index 7361a24cd9..e58d2611af 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -6,31 +6,31 @@ using namespace Realm; void init_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast(args); + RealmTaskArgs const &task_args = + *reinterpret_cast *>(args); auto fn = - RealmTaskArgs.impl_function.get().function_ptr; - *reinterpret_cast(RealmTaskArgs.result) = - fn(RealmTaskArgs.acc); + task_args.impl_function.get().function_ptr; + DeviceSpecificDeviceStates result = fn(task_args.accessor); + task_args.promise.set_value(std::move(result)); } void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast(args); + RealmTaskArgs> const &task_args = + *reinterpret_cast> *>(args); auto fn = - RealmTaskArgs.impl_function.get().function_ptr; - *reinterpret_cast *>(RealmTaskArgs.result) = - fn(RealmTaskArgs.acc); + task_args.impl_function.get().function_ptr; + std::optional result = fn(task_args.accessor); + task_args.promise.set_value(std::move(result)); } void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast(args); + RealmTaskArgs const &task_args = + *reinterpret_cast *>(args); auto fn = - RealmTaskArgs.impl_function.get().function_ptr; - fn(RealmTaskArgs.acc); + task_args.impl_function.get().function_ptr; + fn(task_args.accessor); } void register_wrapper_tasks_init(Processor p, task_id_t task_id) { From fe2bc2160e172afaef4bc8f0c4a09a77a55b9763 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 4 Feb 2025 17:48:38 -0800 Subject: [PATCH 35/91] feat: add realm-backend draft impl --- .../realm-backend/model_training_instance.h | 37 +++ .../include/realm-backend/realm_allocator.h | 59 ++++ .../realm-backend/realm_args_backing.h | 15 +- .../realm_task_argument_accessor.h | 10 +- .../realm-backend/realm_tensor_backing.h | 22 +- .../realm-backend/realm_training_backing.h | 62 ++-- .../src/model_training_instance.cc | 87 +++++ lib/realm-backend/src/realm_allocator.cc | 54 +++ lib/realm-backend/src/realm_args_backing.cc | 65 ++++ .../src/realm_task_argument_accessor.cc | 95 ++++++ lib/realm-backend/src/realm_tensor_backing.cc | 127 ++++++++ .../src/realm_training_backing.cc | 307 +++++++++++------- 12 files changed, 778 insertions(+), 162 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/model_training_instance.h create mode 100644 lib/realm-backend/include/realm-backend/realm_allocator.h create mode 100644 lib/realm-backend/src/model_training_instance.cc create mode 100644 lib/realm-backend/src/realm_allocator.cc create mode 100644 lib/realm-backend/src/realm_args_backing.cc create mode 100644 lib/realm-backend/src/realm_task_argument_accessor.cc create mode 100644 lib/realm-backend/src/realm_tensor_backing.cc diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h 
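To make the producer/consumer split concrete, a usage sketch for the Promise/Future pair and wrapper tasks above (the task-id constant, processor, and memory choice are illustrative):

    // Caller side: allocate region-backed result storage, spawn the wrapper
    // task, then tie the future to the completion event.
    Promise<std::optional<float>> promise(mem);
    Future<std::optional<float>> future = promise.get_future();
    RealmTaskArgs<std::optional<float>> args{task_id, impl_fn, accessor, promise};
    Realm::Event done = worker_proc.spawn(FWDBWD_TASK_ID, &args, sizeof(args));
    future.set_event(done);
    // fwdbwd_wrapper_task runs fn(accessor) and fulfills the promise; get()
    // blocks on the event, then copies the value out of the RegionInstance.
    std::optional<float> elapsed = future.get();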
b/lib/realm-backend/include/realm-backend/model_training_instance.h new file mode 100644 index 0000000000..aa3876fb0d --- /dev/null +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H + +#include "realm-backend/realm_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" + +namespace FlexFlow { + +using PerLayerElapsedTime = + std::unordered_map>; + +struct ModelTrainingInstance { + ModelTrainingInstance(ComputationGraph const &, + RuntimeArgConfig const &, + LossAttrs const &, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, + OptimizerAttrs const &); + + void execute_init(); + PerLayerElapsedTime execute_forward(); + PerLayerElapsedTime execute_backward(); + void execute_update(); + + ComputationGraph computation_graph; + RealmTrainingBacking training_backing; + LossAttrs loss_attrs; + tensor_guid_t logit_tensor; + loss_tensor_t label_tensor; + OptimizerAttrs optimizer_attrs; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_allocator.h b/lib/realm-backend/include/realm-backend/realm_allocator.h new file mode 100644 index 0000000000..1e0c7b23c4 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_allocator.h @@ -0,0 +1,59 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_ALLOCATOR_H +#define _FLEXFLOW_REALM_BACKEND_REALM_ALLOCATOR_H + +#include "realm-backend/driver.h" +#include "realm.h" +#include + +namespace FlexFlow { + +struct RealmAllocatorImpl; + +struct RealmRegion { + Realm::RegionInstance instance; + RealmAllocatorImpl *allocator; +}; + +struct RealmAllocatorImpl { + RealmAllocatorImpl() = delete; + RealmAllocatorImpl(RealmAllocatorImpl const &) = delete; + RealmAllocatorImpl(RealmAllocatorImpl &&) = delete; + RealmAllocatorImpl(Realm::Processor); + ~RealmAllocatorImpl() = default; + + RealmRegion allocate(size_t); + void deallocate(RealmRegion); + +private: + std::unordered_map ptrs; + Realm::Processor proc; + Realm::Memory mem; + std::vector field_sizes = {sizeof(char)}; +}; + +struct RealmAllocator { + RealmAllocator() = delete; + + RealmRegion allocate(size_t); + void deallocate(RealmRegion); + + template + static typename std::enable_if::value, + RealmAllocator>::type + create(Args &&...args) { + return RealmAllocator(std::make_shared(std::forward(args)...)); + } + + RealmAllocator(std::shared_ptr ptr) : i_allocator(ptr) {}; + RealmAllocator(RealmAllocator const &allocator) + : i_allocator(allocator.i_allocator) {}; + +private: + std::shared_ptr i_allocator; +}; + +RealmAllocator create_realm_memory_allocator(Realm::Processor); + +} // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h index 626698cba6..88db880fcb 100644 --- a/lib/realm-backend/include/realm-backend/realm_args_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_args_backing.h @@ -1,22 +1,23 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H #define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/computation_graph.h" -#include "local-execution/per_device_op_state.h" #include "local-execution/op_task_invocation.h" +#include "local-execution/per_device_op_state.h" 
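Downstream of these headers, the intended call pattern for ModelTrainingInstance (declared above) is a conventional training loop; a rough sketch, with construction arguments as in that header:

    ModelTrainingInstance instance(computation_graph,
                                   runtime_arg_config,
                                   loss_attrs,
                                   logit_tensor,
                                   label_tensor,
                                   optimizer_attrs);
    instance.execute_init();
    int num_iters = 100; // illustrative
    for (int iter = 0; iter < num_iters; iter++) {
      PerLayerElapsedTime fwd_times = instance.execute_forward();
      PerLayerElapsedTime bwd_times = instance.execute_backward();
      instance.execute_update(); // optimizer step; mirrors update() in the local backend
    }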
#include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/layer_guid_t.dtg.h" #include "realm-backend/realm_task_argument_accessor.h" +#include "realm-backend/task_result.h" namespace FlexFlow { -struct LocalArgsBacking { - LocalArgsBacking(RuntimeArgConfig const &); +struct RealmArgsBacking { + RealmArgsBacking(RuntimeArgConfig const &); public: void add_per_device_op_state(layer_guid_t const &, - DeviceSpecificDeviceStates const &); + Future &&); ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; @@ -32,6 +33,6 @@ struct LocalArgsBacking { RuntimeArgConfig runtime_arg_config; }; -} +} // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index ca4bc9db02..5c7ecafd0f 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -13,13 +13,13 @@ using TensorSlotsBacking = std::unordered_map< std::variant>>; using ArgSlotsBacking = std::unordered_map; -struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { - LocalTaskArgumentAccessor(Allocator const &allocator, +struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { + RealmTaskArgumentAccessor(Allocator const &allocator, TensorSlotsBacking const &tensor_slots_backing, ArgSlotsBacking const &arg_slots_backing); - LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete; - LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete; + RealmTaskArgumentAccessor(RealmTaskArgumentAccessor const &) = delete; + RealmTaskArgumentAccessor(RealmTaskArgumentAccessor &&) = delete; ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; @@ -48,7 +48,7 @@ TensorSlotsBackingWithoutAddresses get_slots_backing_without_tensor_allocation_addresses( TensorSlotsBacking const &); -CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); +CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index 2d9fa0bbdf..d9df0dfcb1 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -4,6 +4,7 @@ #include "kernels/accessor.h" #include "realm-backend/realm_task_argument_accessor.h" +#include "realm-backend/realm_allocator.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/tensor_role.dtg.h" #include "local-execution/lowered_tensor_t.dtg.h" @@ -16,23 +17,25 @@ namespace FlexFlow { -using TensorBackingMap = - std::unordered_map; +using TensorRegionMap = + std::unordered_map; +using TensorShapeMap = + std::unordered_map; -struct LocalTensorBacking { - LocalTensorBacking(); +struct RealmTensorBacking { + RealmTensorBacking(); public: void allocate_layer_tensors(layer_guid_t const &, ComputationGraph const &, - Allocator &); + RealmAllocator &); void allocate_tensors_by_role(TensorRole const &, layer_guid_t const &, ComputationGraph const &, - Allocator &); + RealmAllocator &); void allocate_optimizer_tensors(tensor_guid_t const &, std::vector const &, - Allocator &); + RealmAllocator &); TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; @@ -43,13 +46,12 @@ struct 
diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h
index 2d9fa0bbdf..d9df0dfcb1 100644
--- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h
+++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h
@@ -4,6 +4,7 @@

 #include "kernels/accessor.h"
 #include "realm-backend/realm_task_argument_accessor.h"
+#include "realm-backend/realm_allocator.h"
 #include "local-execution/task_invocation.dtg.h"
 #include "local-execution/tensor_role.dtg.h"
 #include "local-execution/lowered_tensor_t.dtg.h"
@@ -16,23 +17,25 @@ namespace FlexFlow {

-using TensorBackingMap =
-    std::unordered_map<lowered_tensor_t, GenericTensorAccessorW>;
+using TensorRegionMap =
+    std::unordered_map<lowered_tensor_t, RealmRegion>;
+using TensorShapeMap =
+    std::unordered_map<lowered_tensor_t, TensorShape>;

-struct LocalTensorBacking {
-  LocalTensorBacking();
+struct RealmTensorBacking {
+  RealmTensorBacking();

 public:
   void allocate_layer_tensors(layer_guid_t const &,
                               ComputationGraph const &,
-                              Allocator &);
+                              RealmAllocator &);
   void allocate_tensors_by_role(TensorRole const &,
                                 layer_guid_t const &,
                                 ComputationGraph const &,
-                                Allocator &);
+                                RealmAllocator &);
   void allocate_optimizer_tensors(tensor_guid_t const &,
                                   std::vector<optimizer_tensor_t> const &,
-                                  Allocator &);
+                                  RealmAllocator &);

   TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const;

@@ -43,13 +46,12 @@ struct LocalTensorBacking {

 public:
   // tensors
-  TensorBackingMap tensor_backings;
-
+  TensorRegionMap tensor_regions;
+  TensorShapeMap tensor_shapes;
   std::unordered_map<tensor_guid_t, lowered_tensor_t> tensor_lowering_mapping;
   std::unordered_map<tensor_guid_t, lowered_tensor_t>
       gradient_tensor_lowering_mapping;
   std::unordered_map<optimizer_tensor_t, lowered_tensor_t>
       optimizer_tensor_lowering_mapping;
   std::unordered_map<loss_tensor_t, lowered_tensor_t>
       loss_tensor_lowering_mapping;
-
   LoweredTensorSource lowered_tensor_source;
 };
diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h
index e5385a93c3..ddd3bb7ed1 100644
--- a/lib/realm-backend/include/realm-backend/realm_training_backing.h
+++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h
@@ -1,56 +1,62 @@
 #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H
 #define _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H

-#include "realm-backend/realm_tensor_backing.h"
-#include "realm-backend/realm_args_backing.h"
+#include "local-execution/optimizer_tensor_source.h"
 #include "local-execution/task_registry.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "local-execution/optimizer_tensor_source.h"
+#include "realm-backend/driver.h"
+#include "realm-backend/realm_allocator.h"
+#include "realm-backend/realm_args_backing.h"
+#include "realm-backend/realm_tensor_backing.h"
+#include "realm-backend/task_wrapper.h"

 namespace FlexFlow {

 using PerLayerElapsedTime =
     std::unordered_map<layer_guid_t, std::optional<float>>;

-struct LocalTrainingBacking {
-  LocalTrainingBacking(Allocator const &,
-                       ComputationGraph const &,
-                       RuntimeArgConfig const &);
+struct RealmTrainingBacking {
+  RealmTrainingBacking(ComputationGraph const &, RuntimeArgConfig const &,
+                       Realm::Processor);

   void register_and_allocate_layer(layer_guid_t const &);
   void allocate_layer_optimizer_tensors(layer_guid_t const &,
                                         OptimizerAttrs const &);

   void execute_init(layer_guid_t const &);
-  std::optional<float> execute_forward(layer_guid_t const &);
-  void compute_loss(LossAttrs const &loss_attrs,
-                    tensor_guid_t const &logit_tensor,
-                    loss_tensor_t const &label_tensor);
-  std::optional<float> execute_backward(layer_guid_t const &);
-  void execute_update(layer_guid_t const &, OptimizerAttrs const &);
-
-  TaskArgumentAccessor
-      get_task_arg_accessor(TaskInvocation const &) const;
+  Future<std::optional<float>> execute_forward(layer_guid_t const &);
+  Future<std::optional<float>> execute_backward(layer_guid_t const &);
+  Future<void> execute_update(layer_guid_t const &, OptimizerAttrs const &);
+  Future<void> compute_loss(LossAttrs const &loss_attrs,
+                            tensor_guid_t const &logit_tensor,
+                            loss_tensor_t const &label_tensor);

-  TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const;
+  TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const;

-  LocalTensorBacking local_tensor_backing;
-  LocalArgsBacking local_args_backing;
+  TaskInvocation lower_to_task_invocation(OpTaskInvocation const &,
+                                          layer_guid_t const &) const;

-private:
-  DeviceSpecificDeviceStates call_init_task_impl(task_id_t,
-                                                 TaskArgumentAccessor const &);
-  std::optional<float> call_task_impl(task_id_t, TaskArgumentAccessor);
-
-private:
-  Allocator allocator;
   ComputationGraph computation_graph;
   TaskRegistry task_registry;

-  // optimizer
+  // runtime
+  Realm::Processor master_proc;
+  Realm::Memory master_mem;
+  std::vector<Realm::Processor> worker_procs;
+  std::unordered_map<Realm::Processor, Realm::Event> proc_events;
+  std::vector<RealmAllocator> allocators;
+
+  // storage
+  RealmTensorBacking realm_tensor_backing;
+  RealmArgsBacking realm_args_backing;
   OptimizerTensorSource optimizer_tensor_source;
-  std::unordered_map<layer_guid_t, std::vector<optimizer_tensor_t>> layer_optimizer_tensor_ids;
+  std::unordered_map<layer_guid_t, std::vector<optimizer_tensor_t>>
+      layer_optimizer_tensor_ids;
+
+private:
+  std::optional<float> call_task_impl(task_id_t, TaskSignatureAndImpl,
+                                      TaskArgumentAccessor);
 };

 } // namespace FlexFlow
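NOTE (sketch): the Future-returning API above is meant to be consumed in two phases:
spawn every layer's task without blocking, then drain the futures. This mirrors the
ModelTrainingInstance driver in the next file; `cg`, `config`, and `proc` are
illustrative:

    RealmTrainingBacking backing{cg, config, proc};
    for (layer_guid_t const &l : topological_ordering(cg)) {
      backing.register_and_allocate_layer(l);
    }
    std::vector<Future<std::optional<float>>> pending;
    for (layer_guid_t const &l : topological_ordering(cg)) {
      pending.push_back(backing.execute_forward(l)); // spawn, don't block
    }
    for (Future<std::optional<float>> &f : pending) {
      f.get();                                       // then drain
    }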
diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc
new file mode 100644
index 0000000000..f9c959c389
--- /dev/null
+++ b/lib/realm-backend/src/model_training_instance.cc
@@ -0,0 +1,87 @@
+#include "pcg/computation_graph.h"
+#include "pcg/optimizer_attrs.h"
+#include "realm-backend/model_training_instance.h"
+#include "utils/containers/reversed.h"
+
+namespace FlexFlow {
+
+ModelTrainingInstance::ModelTrainingInstance(
+    ComputationGraph const &computation_graph,
+    RuntimeArgConfig const &runtime_arg_config, LossAttrs const &loss_attrs,
+    tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor,
+    OptimizerAttrs const &optimizer_attrs)
+    : computation_graph(computation_graph),
+      training_backing(computation_graph, runtime_arg_config),
+      loss_attrs(loss_attrs), logit_tensor(logit_tensor),
+      label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {
+
+  // allocate each layer's tensors
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    this->training_backing.register_and_allocate_layer(node);
+    this->training_backing.allocate_layer_optimizer_tensors(
+        node, this->optimizer_attrs);
+  }
+}
+
+void ModelTrainingInstance::execute_init() {
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_init(node);
+  }
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_forward() {
+  PerLayerElapsedTime per_layer_elapsed_time;
+  std::unordered_map<layer_guid_t, Future<std::optional<float>>>
+      per_layer_elapsed_time_future;
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    per_layer_elapsed_time_future.insert(
+        {node, this->training_backing.execute_forward(node)});
+  }
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    std::optional<float> elapsed_time =
+        per_layer_elapsed_time_future[node].get();
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_backward() {
+  this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor,
+                                      this->label_tensor);
+  PerLayerElapsedTime per_layer_elapsed_time;
+  std::unordered_map<layer_guid_t, Future<std::optional<float>>>
+      per_layer_elapsed_time_future;
+  for (layer_guid_t const &node :
+       reversed(topological_ordering(this->computation_graph))) {
+    per_layer_elapsed_time_future.insert(
+        {node, this->training_backing.execute_backward(node)});
+  }
+  for (layer_guid_t const &node :
+       reversed(topological_ordering(this->computation_graph))) {
+    std::optional<float> elapsed_time =
+        per_layer_elapsed_time_future[node].get();
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+void ModelTrainingInstance::execute_update() {
+  std::unordered_map<layer_guid_t, Future<void>> per_layer_update_future;
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    per_layer_update_future.insert(
+        {node,
+         this->training_backing.execute_update(node, this->optimizer_attrs)});
+  }
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    per_layer_update_future[node].wait();
+  }
+  this->optimizer_attrs =
+      get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
+}
+
+} // namespace FlexFlow
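NOTE (sketch): the "spawn everything, then drain the futures" style above stays correct
on a single worker because realm_training_backing.cc (further below) chains every spawn
on the previous task's completion event for that processor:

    // pattern used at every spawn site; proc_events remembers the chain tail
    Realm::Event prev = proc_events[worker];
    Realm::Event done = worker.spawn(func_id, &args, sizeof(args), prev);
    proc_events[worker] = done; // the next task on `worker` waits on `done`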
diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc
new file mode 100644
index 0000000000..fadc7f5719
--- /dev/null
+++ b/lib/realm-backend/src/realm_allocator.cc
@@ -0,0 +1,54 @@
+#include "realm-backend/realm_allocator.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+using namespace Realm;
+
+/*********** RealmAllocatorImpl ***********/
+
+RealmAllocatorImpl::RealmAllocatorImpl(Processor proc) : proc(proc) {
+  mem = Machine::MemoryQuery(Machine::get_machine())
+            .only_kind(Memory::GPU_FB_MEM)
+            .best_affinity_to(proc)
+            .first();
+}
+
+// TODO: now the region instance only corresponds to one tensor
+RealmRegion RealmAllocatorImpl::allocate(size_t requested_memory_size) {
+  Rect<1> bounds(Point<1>(0), Point<1>(requested_memory_size - 1));
+  RegionInstance requested_instance = RegionInstance::NO_INST;
+  RegionInstance::create_instance(requested_instance, mem, bounds, field_sizes,
+                                  /*SOA*/ 1, ProfilingRequestSet())
+      .wait();
+  void *ptr = requested_instance.pointer_untyped(0, 0);
+  this->ptrs.insert({requested_instance, ptr});
+  return {requested_instance, this};
+}
+
+void RealmAllocatorImpl::deallocate(RealmRegion region) {
+  if (region.allocator == this and contains_key(this->ptrs, region.instance)) {
+    // destroy the instance and forget its cached base pointer
+    region.instance.destroy();
+    this->ptrs.erase(region.instance);
+  } else {
+    throw std::runtime_error(
+        "Deallocating a pointer that was not allocated by this Allocator");
+  }
+}
+
+/*********** RealmAllocator ***********/
+
+RealmRegion RealmAllocator::allocate(size_t mem_size) {
+  return this->i_allocator->allocate(mem_size);
+}
+
+void RealmAllocator::deallocate(RealmRegion region) {
+  this->i_allocator->deallocate(region);
+}
+
+RealmAllocator create_realm_memory_allocator(Processor proc) {
+  return RealmAllocator::create(proc);
+}
+
+} // namespace FlexFlow
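NOTE (sketch): allocate() sizes the instance as `requested_memory_size` points of a
single 1-byte field, so the untyped base pointer addresses exactly that many bytes.
Spelled out (the Realm calls are real; `nbytes` is illustrative):

    size_t nbytes = /* tensor size in bytes */;
    Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(nbytes - 1));
    // field_sizes == {sizeof(char)}  =>  one byte per point, nbytes total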
diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc
new file mode 100644
index 0000000000..ae7022f4b0
--- /dev/null
+++ b/lib/realm-backend/src/realm_args_backing.cc
@@ -0,0 +1,65 @@
+#include "op-attrs/parallel_tensor_shape.h"
+#include "realm-backend/realm_args_backing.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/map_values.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+void RealmArgsBacking::add_per_device_op_state(
+    layer_guid_t const &op_guid, Future<DeviceSpecificDeviceStates> &&future) {
+  if (per_device_op_states.find(op_guid) != per_device_op_states.end()) {
+    throw mk_runtime_error("Op state already exists");
+  }
+  per_device_op_states.insert({op_guid, std::move(future)});
+}
+
+ArgSlotsBacking RealmArgsBacking::construct_arg_slots_backing(
+    TaskBinding const &binding) const {
+  return map_values(binding.get_arg_bindings(),
+                    [&](TaskArgSpec const &arg_binding) {
+                      return arg_binding.template visit<ConcreteArgSpec>(
+                          overload{[&](RuntimeArgRefSpec const &s) {
+                                     return this->lower_to_concrete_arg_spec(s);
+                                   },
+                                   [](ConcreteArgSpec const &s) { return s; }});
+                    });
+}
+
+ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec(
+    OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const &cg,
+    layer_guid_t const &op_guid) const {
+  if (op_arg_ref_spec.holds<DeviceSpecificDeviceStates>()) {
+    assert(contains_key(this->per_device_op_states, op_guid));
+    // blocks until the corresponding init task has produced the state
+    DeviceSpecificDeviceStates device_specific =
+        per_device_op_states.at(op_guid).get();
+    PerDeviceOpState device_state =
+        get_device_state_from_device_specific(device_specific, 0);
+    return ConcreteArgSpec::create(device_state);
+  } else if (op_arg_ref_spec.holds<ParallelTensorShape>()) {
+    ParallelTensorShapeRefType index_op_arg_ref =
+        op_arg_ref_spec.get_ref_type().get<ParallelTensorShapeRefType>();
+    tensor_guid_t input_tensor =
+        get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx);
+    TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor);
+    ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape);
+    return ConcreteArgSpec::create(shape);
+  } else {
+    throw mk_runtime_error("Unhandled op arg ref type");
+  }
+}
+
+ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec(
+    RuntimeArgRefSpec const &runtime_arg_ref_spec) const {
+  if (runtime_arg_ref_spec.holds<DeviceSpecific<PerDeviceFFHandle>>()) {
+    return ConcreteArgSpec::create(
+        *(this->runtime_arg_config.ff_handle.get(0)));
+  } else if (runtime_arg_ref_spec.holds<ProfilingSettings>()) {
+    return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings);
+  } else {
+    throw mk_runtime_error("Unhandled runtime arg ref type");
+  }
+}
+
+} // namespace FlexFlow
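NOTE (sketch): this lowering is what keeps op task code backend-agnostic: bindings
carry refs, and the backend resolves them to shippable values right before spawn.
Hypothetical flow for one profiling argument (PROFILING is an illustrative slot name):

    // op code binds a runtime ref, not a value
    binding.bind_arg(PROFILING, profiling_settings());   // a RuntimeArgRefSpec
    // at launch time, RealmArgsBacking turns the ref into a concrete value
    ConcreteArgSpec spec = args_backing.lower_to_concrete_arg_spec(ref_spec);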
diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc
new file mode 100644
index 0000000000..7b27bad6c2
--- /dev/null
+++ b/lib/realm-backend/src/realm_task_argument_accessor.cc
@@ -0,0 +1,95 @@
+#include "realm-backend/realm_task_argument_accessor.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/transform.h"
+#include "utils/hash/pair.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+RealmTaskArgumentAccessor::RealmTaskArgumentAccessor(
+    Allocator const &allocator,
+    TensorSlotsBacking const &tensor_slots_backing,
+    ArgSlotsBacking const &arg_slots_backing)
+    : allocator(allocator), tensor_slots_backing(tensor_slots_backing),
+      arg_slots_backing(arg_slots_backing){};
+
+ConcreteArgSpec const &
+    RealmTaskArgumentAccessor::get_concrete_arg(slot_id_t name) const {
+  return this->arg_slots_backing.at(name);
+}
+
+GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor(
+    slot_id_t slot, Permissions priv, TensorType tensor_type) const {
+  SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type};
+  auto tensor_backing = std::get<GenericTensorAccessorW>(
+      this->tensor_slots_backing.at(slot_tensor_type));
+  if (priv == Permissions::RO) {
+    GenericTensorAccessorR readonly_tensor_backing = {
+        tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr};
+    return readonly_tensor_backing;
+  } else if (priv == Permissions::RW || priv == Permissions::WO) {
+    return tensor_backing;
+  } else {
+    throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv));
+  }
+}
+
+VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor(
+    slot_id_t slot, Permissions priv, TensorType tensor_type) const {
+  SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type};
+  auto variadic_tensor_backing = std::get<std::vector<GenericTensorAccessorW>>(
+      this->tensor_slots_backing.at(slot_tensor_type));
+  if (priv == Permissions::RO) {
+    std::vector<GenericTensorAccessorR> readonly_variadic_tensor_backing = {};
+    for (GenericTensorAccessorW const &tensor_backing :
+         variadic_tensor_backing) {
+      readonly_variadic_tensor_backing.push_back(
+          {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr});
+    }
+    return readonly_variadic_tensor_backing;
+  } else if (priv == Permissions::RW || priv == Permissions::WO) {
+    return variadic_tensor_backing;
+  } else {
+    throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv));
+  }
+}
+
+Allocator RealmTaskArgumentAccessor::get_allocator() const {
+  return this->allocator;
+}
+
+TensorSlotsBackingWithoutAddresses
+    get_slots_backing_without_tensor_allocation_addresses(
+        TensorSlotsBacking const &slots_backing) {
+
+  TensorSlotsBackingWithoutAddresses addressless_slots_backing;
+
+  using TensorAccessorVariant =
+      std::variant<GenericTensorAccessorW,
+                   std::vector<GenericTensorAccessorW>>;
+  for (auto const &slot_tensor : slots_backing) {
+    TensorAccessorVariant accessor_variant = slot_tensor.second;
+    std::visit(
+        overload{
+            [&](GenericTensorAccessorW const &accessor) {
+              addressless_slots_backing.insert(
+                  {slot_tensor.first, get_shape_and_datatype(accessor)});
+            },
+            [&](std::vector<GenericTensorAccessorW> const &variadic_accessor) {
+              std::vector<std::pair<ArrayShape, DataType>>
+                  variadic_addressless_accessor =
+                      transform(variadic_accessor,
+                                [](GenericTensorAccessorW const &accessor) {
+                                  return get_shape_and_datatype(accessor);
+                                });
+              addressless_slots_backing.insert(
+                  {slot_tensor.first, variadic_addressless_accessor});
+            }},
+        accessor_variant);
+  }
+  return addressless_slots_backing;
+}
+
+size_t RealmTaskArgumentAccessor::get_device_idx() const {
+  return 0;
+}
+
+} // namespace FlexFlow
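NOTE (sketch): get_slots_backing_without_tensor_allocation_addresses exists so tests
can compare two slot backings structurally; device pointers differ run to run, so
equality is checked on (shape, dtype) alone. Assumed doctest-style usage:

    CHECK(get_slots_backing_without_tensor_allocation_addresses(actual) ==
          get_slots_backing_without_tensor_allocation_addresses(expected));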
diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc
new file mode 100644
index 0000000000..6edf6cf064
--- /dev/null
+++ b/lib/realm-backend/src/realm_tensor_backing.cc
@@ -0,0 +1,127 @@
+#include "realm-backend/realm_tensor_backing.h"
+#include "local-execution/tensor_lowering.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_shape.dtg.h"
+#include "pcg/computation_graph.h"
+#include "realm-backend/realm_allocator.h"
+#include "utils/containers/contains_key.h"
+#include "utils/overload.h"
+#include "local-execution/slot_grad_id.dtg.h"
+
+namespace FlexFlow {
+
+RealmTensorBacking::RealmTensorBacking() {};
+
+void RealmTensorBacking::allocate_layer_tensors(
+    layer_guid_t const &layer_guid,
+    ComputationGraph const &computation_graph,
+    RealmAllocator &allocator) {
+  this->allocate_tensors_by_role(
+      TensorRole::INPUT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::WEIGHT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::OUTPUT, layer_guid, computation_graph, allocator);
+}
+
+void RealmTensorBacking::allocate_tensors_by_role(
+    TensorRole const &role,
+    layer_guid_t const &layer_guid,
+    ComputationGraph const &computation_graph,
+    RealmAllocator &allocator) {
+  std::vector<tensor_guid_t> tensors;
+  switch (role) {
+    case TensorRole::INPUT:
+      tensors = get_incoming_inputs(computation_graph, layer_guid);
+      break;
+    case TensorRole::WEIGHT:
+      tensors = get_incoming_weights(computation_graph, layer_guid);
+      break;
+    case TensorRole::OUTPUT:
+      tensors = get_outgoing_tensors(computation_graph, layer_guid);
+      break;
+    default:
+      throw mk_runtime_error(fmt::format("Invalid tensor role, got {}", role));
+  }
+
+  for (tensor_guid_t const &tensor : tensors) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor);
+    // tensor allocation
+    if (!contains_key(this->tensor_lowering_mapping, tensor)) {
+      lowered_tensor_t reduced_tensor =
+          this->lowered_tensor_source.new_lowered_tensor();
+      this->tensor_lowering_mapping.insert({tensor, reduced_tensor});
+      RealmRegion region =
+          allocator.allocate(get_size_in_bytes(tensor_attrs.shape));
+      this->tensor_regions.insert({reduced_tensor, region});
+      this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape});
+    }
+
+    // gradient tensor allocation
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !contains_key(this->gradient_tensor_lowering_mapping, tensor)) {
+      lowered_tensor_t reduced_tensor =
+          this->lowered_tensor_source.new_lowered_tensor();
+      this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor});
+      RealmRegion region =
+          allocator.allocate(get_size_in_bytes(tensor_attrs.shape));
+      this->tensor_regions.insert({reduced_tensor, region});
+      this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape});
+    }
+  }
+}
+
+void RealmTensorBacking::allocate_optimizer_tensors(
+    tensor_guid_t const &weight,
+    std::vector<optimizer_tensor_t> const &optimizer_tensors,
+    RealmAllocator &allocator) {
+  GenericTensorAccessorW weight_backing =
+      this->get_tensor_backing(this->tensor_lowering_mapping.at(weight));
+  for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) {
+    // optimizer tensor allocation
+    if (!contains_key(this->optimizer_tensor_lowering_mapping,
+                      optimizer_tensor)) {
+      lowered_tensor_t buffer_tensor =
+          this->lowered_tensor_source.new_lowered_tensor();
+      this->optimizer_tensor_lowering_mapping.insert(
+          {optimizer_tensor, buffer_tensor});
+      TensorShape tensor_shape =
+          get_tensor_shape(weight_backing.shape, weight_backing.data_type);
+      RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape));
+      this->tensor_regions.insert({buffer_tensor, region});
+      this->tensor_shapes.insert({buffer_tensor, tensor_shape});
+    }
+  }
+}
+
+bool RealmTensorBacking::is_tensor_allocated(
+    lowered_tensor_t const &tensor_id) const {
+  return contains_key(tensor_regions, tensor_id);
+}
+
+GenericTensorAccessorW RealmTensorBacking::get_tensor_backing(
+    lowered_tensor_t const &tensor_id) const {
+  // returned by value: the accessor is assembled on the fly from the region's
+  // base pointer and the recorded shape, so returning a reference would dangle
+  void *ptr =
+      this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0);
+  TensorShape shape = this->tensor_shapes.at(tensor_id);
+  return {shape.data_type, ArrayShape{shape}, ptr};
+}
+
+TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing(
+    TaskBinding const &binding) const {
+  TensorSlotsBacking mapping;
+
+  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
+    SlotTensorTypeId slot_tensor_type_id = tensor_binding.first;
+
+    lowered_tensor_t tensor_id = [&] {
+      TensorTypeVariant tensor_type = tensor_binding.second;
+      if (tensor_type.has<tensor_guid_t>() and
+          slot_tensor_type_id.tensor_type == TensorType::FORWARD) {
+        return this->tensor_lowering_mapping.at(
+            tensor_type.get<tensor_guid_t>());
+      } else if (tensor_type.has<tensor_guid_t>() and
+                 slot_tensor_type_id.tensor_type == TensorType::GRADIENT) {
+        return this->gradient_tensor_lowering_mapping.at(
+            tensor_type.get<tensor_guid_t>());
+      } else if (tensor_type.has<optimizer_tensor_t>()) {
+        return this->optimizer_tensor_lowering_mapping.at(
+            tensor_type.get<optimizer_tensor_t>());
+      } else if (tensor_type.has<loss_tensor_t>()) {
+        return this->loss_tensor_lowering_mapping.at(
+            tensor_type.get<loss_tensor_t>());
+      } else {
+        throw mk_runtime_error(fmt::format("Tensor binding has invalid type"));
+      }
+    }();
+
+    GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id);
+    mapping.insert({slot_tensor_type_id, accessor});
+  }
+
+  return mapping;
+}
+
+} // namespace FlexFlow
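NOTE (sketch): tensor lookup above is a two-hop indirection, and all four tensor kinds
(forward, gradient, optimizer, loss) converge on the same region/shape tables:

    // hop 1: framework-level id -> lowered id (one mapping per tensor kind)
    lowered_tensor_t low = backing.tensor_lowering_mapping.at(some_tensor_guid);
    // hop 2: lowered id -> device buffer + shape
    GenericTensorAccessorW acc = backing.get_tensor_backing(low);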
"realm-backend/task_wrapper.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -14,24 +17,47 @@ namespace FlexFlow { -LocalTrainingBacking::LocalTrainingBacking( - Allocator const &allocator, +using namespace Realm; + +RealmTrainingBacking::RealmTrainingBacking( ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config) - : allocator(allocator), computation_graph(computation_graph), - local_args_backing(runtime_arg_config), - task_registry(empty_task_registry()) {}; + RuntimeArgConfig const &runtime_arg_config, Realm::Processor master_proc) + : computation_graph(computation_graph), + realm_args_backing(runtime_arg_config), + task_registry(empty_task_registry()) { + master_proc = master_proc; + proc_events.insert({master_proc, Realm::Event::NO_EVENT}); + master_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::SYSTEM_MEM) + .best_affinity_to(master_proc) + .first(); + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + for (Processor p : pq) { + worker_procs.push_back(p); + proc_events.insert({p, Realm::Event::NO_EVENT}); + allocators.push_back(RealmAllocator(p)); + } + assert(worker_procs.size() > 0); +} -void LocalTrainingBacking::register_and_allocate_layer( +void RealmTrainingBacking::register_and_allocate_layer( layer_guid_t const &node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, node).attrs; - this->local_tensor_backing.allocate_layer_tensors( - node, this->computation_graph, this->allocator); + this->realm_tensor_backing.allocate_layer_tensors( + node, this->computation_graph, this->allocators[0]); register_tasks_for_layer(this->task_registry, node, attrs); + // TODO: multi gpu launching + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = + this->task_registry.task_mapping.at(task_id); + register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); + } } -void LocalTrainingBacking::allocate_layer_optimizer_tensors( +void RealmTrainingBacking::allocate_layer_optimizer_tensors( layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, node).attrs; @@ -41,165 +67,222 @@ void LocalTrainingBacking::allocate_layer_optimizer_tensors( get_only(get_outgoing_tensors(this->computation_graph, node)); std::vector optimizer_tensors; - for (TensorTypeSlotSpec const & tensor_type_slot_spec: values(sig.tensor_guid_slots)) { - optimizer_tensors.push_back(this->optimizer_tensor_source.new_optimizer_tensor()); + for (TensorTypeSlotSpec const &tensor_type_slot_spec : + values(sig.tensor_guid_slots)) { + optimizer_tensors.push_back( + this->optimizer_tensor_source.new_optimizer_tensor()); } this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); - this->local_tensor_backing.allocate_optimizer_tensors( - weight_tensor, optimizer_tensors, this->allocator); + this->realm_tensor_backing.allocate_optimizer_tensors( + weight_tensor, optimizer_tensors, this->allocators[0]); } } -DeviceSpecificDeviceStates - LocalTrainingBacking::call_init_task_impl(task_id_t task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = - this->task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; - return fn(acc); -} - -std::optional - 
-DeviceSpecificDeviceStates
-    LocalTrainingBacking::call_init_task_impl(task_id_t task_id,
-                                              TaskArgumentAccessor const &acc) {
-  TaskSignatureAndImpl task_sig_impl =
-      this->task_registry.task_mapping.at(task_id);
-  auto fn =
-      task_sig_impl.impl_function.get<InitOpTaskImplFunction>().function_ptr;
-  return fn(acc);
-}
-
-std::optional<float>
-    LocalTrainingBacking::call_task_impl(task_id_t task_id,
-                                         TaskArgumentAccessor acc) {
-  TaskSignatureAndImpl task_sig_impl =
-      this->task_registry.task_mapping.at(task_id);
-  auto fn =
-      task_sig_impl.impl_function.get<FwdBwdOpTaskImplFunction>().function_ptr;
-  return fn(acc);
-}
-
-void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(
-          this->task_registry, operator_node, OpTaskType::INIT)) {
+void RealmTrainingBacking::execute_init(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(this->task_registry, operator_node,
+                                       OpTaskType::INIT)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-    TaskInvocation invocation = this->lower_to_task_invocation(init(attrs));
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
-    DeviceSpecificDeviceStates device_state =
-        this->call_init_task_impl(invocation.task_id, accessor);
-    this->local_args_backing.add_per_device_op_state(operator_node,
-                                                     device_state);
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(init(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    TaskImplFunction impl_function =
+        this->task_registry.task_mapping.at(task_id).impl_function;
+    // TODO: multi gpu launching
+    Promise<DeviceSpecificDeviceStates> promise(master_mem);
+    Future<DeviceSpecificDeviceStates> future = promise.get_future();
+    RealmTaskArgs<DeviceSpecificDeviceStates> args{
+        task_id, impl_function, accessor, std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    this->realm_args_backing.add_per_device_op_state(operator_node,
+                                                     std::move(future));
   }
 }

-std::optional<float>
-    LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(
-          this->task_registry, operator_node, OpTaskType::FWD)) {
+Future<std::optional<float>>
+    RealmTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(this->task_registry, operator_node,
+                                       OpTaskType::FWD)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-    TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs));
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
-    return this->call_task_impl(invocation.task_id, accessor);
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(forward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    TaskImplFunction impl_function =
+        this->task_registry.task_mapping.at(task_id).impl_function;
+    // TODO: multi gpu launching
+    Promise<std::optional<float>> promise(master_mem);
+    Future<std::optional<float>> future = promise.get_future();
+    RealmTaskArgs<std::optional<float>> args{task_id, impl_function, accessor,
+                                             std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    return future;
   } else {
-    return std::nullopt;
+    return Future<std::optional<float>>(std::nullopt);
   }
 }
-void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
-                                        tensor_guid_t const &logit_tensor,
-                                        loss_tensor_t const &label_tensor) {
-  TaskInvocation loss_invocation =
-      backward(loss_attrs, logit_tensor, label_tensor);
-  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
-  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
-  TaskArgumentAccessor loss_accessor =
-      this->get_task_arg_accessor(loss_invocation);
-  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
-  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
-}
-
-std::optional<float>
-    LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(
-          this->task_registry, operator_node, OpTaskType::BWD)) {
+Future<std::optional<float>>
+    RealmTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(this->task_registry, operator_node,
+                                       OpTaskType::BWD)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-    TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs));
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
-    return this->call_task_impl(invocation.task_id, accessor);
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(backward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    TaskImplFunction impl_function =
+        this->task_registry.task_mapping.at(task_id).impl_function;
+    // TODO: multi gpu launching
+    Promise<std::optional<float>> promise(master_mem);
+    Future<std::optional<float>> future = promise.get_future();
+    RealmTaskArgs<std::optional<float>> args{task_id, impl_function, accessor,
+                                             std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    return future;
   } else {
-    return std::nullopt;
+    return Future<std::optional<float>>(std::nullopt);
  }
 }

-void LocalTrainingBacking::execute_update(
-    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+Future<void>
+    RealmTrainingBacking::execute_update(layer_guid_t const &node,
+                                         OptimizerAttrs const &optimizer_attrs) {
   LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
   if (layer_attrs.attrs.has<WeightAttrs>()) {
     // get tensors
-    tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node));
-    std::vector<optimizer_tensor_t> optimizer_buffer_tensors = this->layer_optimizer_tensor_ids.at(node);
-
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    std::vector<optimizer_tensor_t> optimizer_buffer_tensors =
+        this->layer_optimizer_tensor_ids.at(node);
     // get invocation
     TaskInvocation invocation = get_update_invocation(
         optimizer_attrs, weight_tensor, optimizer_buffer_tensors);
-
     // TODO: https://github.com/flexflow/flexflow-train/issues/1442
     // assert(is_invocation_valid(get_update_signature(attrs), invocation));
-
-    // execute update
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    register_wrapper_tasks_generic(worker_procs[0], task_id);
     TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
-    update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
+    // TODO: multi gpu launching
+    Promise<void> promise;
+    Future<void> future = promise.get_future();
+    RealmTaskArgs<void> args{task_id, update_impl_fn, accessor,
+                             std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    return future;
+  } else {
+    return Future<void>();
   }
 }
+Future<void>
+    RealmTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
+                                       tensor_guid_t const &logit_tensor,
+                                       loss_tensor_t const &label_tensor) {
+  TaskInvocation loss_invocation =
+      backward(loss_attrs, logit_tensor, label_tensor);
+  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor =
+      this->get_task_arg_accessor(loss_invocation);
+  task_id_t task_id = loss_invocation.task_id;
+  register_wrapper_tasks_generic(worker_procs[0], task_id);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  // TODO: multi gpu launching
+  Promise<void> promise;
+  Future<void> future = promise.get_future();
+  RealmTaskArgs<void> args{task_id, loss_impl_fn, loss_accessor,
+                           std::move(promise)};
+  Event e =
+      worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id), &args,
+                            sizeof(args), proc_events[worker_procs[0]]);
+  proc_events[worker_procs[0]] = e;
+  future.set_event(e);
+  return future;
+}
+
-TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
+TaskArgumentAccessor RealmTrainingBacking::get_task_arg_accessor(
     TaskInvocation const &invocation) const {
   TensorSlotsBacking tensor_slots_backing =
-      this->local_tensor_backing.construct_tensor_slots_backing(
+      this->realm_tensor_backing.construct_tensor_slots_backing(
           invocation.binding);
   ArgSlotsBacking arg_slots_backing =
-      this->local_args_backing.construct_arg_slots_backing(invocation.binding);
-  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
-      this->allocator, tensor_slots_backing, arg_slots_backing);
+      this->realm_args_backing.construct_arg_slots_backing(invocation.binding);
+  return TaskArgumentAccessor::create<RealmTaskArgumentAccessor>(
+      this->allocators[0], tensor_slots_backing, arg_slots_backing);
 }

-TaskInvocation LocalTrainingBacking::lower_to_task_invocation(OpTaskInvocation const & op_task_invocation, layer_guid_t const & layer_guid) const {
+TaskInvocation RealmTrainingBacking::lower_to_task_invocation(
+    OpTaskInvocation const &op_task_invocation,
+    layer_guid_t const &layer_guid) const {
   TaskBinding binding;
   // tensors
-  for (auto const & tensor_binding: op_task_invocation.binding.get_tensor_bindings()) {
-    tensor_guid_t tensor_to_bind = [&] {
+  for (auto const &tensor_binding :
+       op_task_invocation.binding.get_tensor_bindings()) {
+    tensor_guid_t tensor_to_bind = [&]() -> tensor_guid_t {
       switch (tensor_binding.second.role) {
-      case TensorRole::INPUT:
-        return get_incoming_inputs(this->computation_graph, layer_guid).at(tensor_binding.second.idx);
-      case TensorRole::OUTPUT:
-        return get_outgoing_tensors(this->computation_graph, layer_guid).at(tensor_binding.second.idx);
-      case TensorRole::WEIGHT:
-        return get_incoming_weights(this->computation_graph, layer_guid).at(tensor_binding.second.idx);
-      default:
-        throw mk_runtime_error(fmt::format("Invalid tensor role {}", tensor_binding.second.role));
+        case TensorRole::INPUT:
+          return get_incoming_inputs(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::OUTPUT:
+          return get_outgoing_tensors(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::WEIGHT:
+          return get_incoming_weights(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        default:
+          throw mk_runtime_error(
+              fmt::format("Invalid tensor role {}", tensor_binding.second.role));
      }
-    }();
+    }();
     if (tensor_binding.first.is_grad == IsGrad::NO) {
       binding.bind(tensor_binding.first.slot_id, tensor_to_bind);
     } else if (tensor_binding.first.is_grad == IsGrad::YES) {
       binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind);
     } else {
-      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad));
+      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}",
+                                         tensor_binding.first.is_grad));
     }
   }
   // args
-  for (auto const & arg_binding: op_task_invocation.binding.get_arg_bindings()) {
+  for (auto const &arg_binding :
+       op_task_invocation.binding.get_arg_bindings()) {
     if (arg_binding.second.has<OpArgRefSpec>()) {
-      ConcreteArgSpec concrete_arg = this->local_args_backing.lower_to_concrete_arg_spec(arg_binding.second.get<OpArgRefSpec>(), this->computation_graph, layer_guid);
+      ConcreteArgSpec concrete_arg =
+          this->realm_args_backing.lower_to_concrete_arg_spec(
+              arg_binding.second.get<OpArgRefSpec>(), this->computation_graph,
+              layer_guid);
       binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg});
     } else if (arg_binding.second.has<RuntimeArgRefSpec>()) {
-      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()});
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()});
     } else {
-      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()});
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()});
    }
  }

From 8efaec7f2590bc4b8613c9f742910119d67df71a Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Wed, 5 Feb 2025 17:15:22 -0800
Subject: [PATCH 36/91] Build

---
 lib/kernels/include/kernels/array_shape.h     |  5 +--
 lib/kernels/include/kernels/legion_dim.h      |  2 +
 lib/kernels/src/allocation.cc                 |  1 -
 lib/kernels/src/array_shape.cc                | 38 +++++++---------
 lib/kernels/src/cuda/ops/concat_kernels.cu    |  3 +-
 lib/kernels/src/legion_dim.cc                 |  6 +++
 .../src/local-execution/ops/transpose.cc      | 28 +-----------
 .../src/local_cost_estimator.cc               |  2 +-
 lib/local-execution/src/loss_functions.cc     | 45 ++++++++++---------
 lib/local-execution/src/optimizer.cc          | 16 +++----
 lib/local-execution/src/task_registry.cc      |  6 +--
 11 files changed, 62 insertions(+), 90 deletions(-)

diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 09e53582ea..95d20ceca3 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -18,7 +18,7 @@ struct ArrayShape {
   explicit ArrayShape(nonnegative_int *dims, nonnegative_int num_dims);
   explicit ArrayShape(TensorShape const &shape);
   explicit ArrayShape(std::vector<nonnegative_int> const &);
-  explicit ArrayShape(LegionTensorDims const &);
+  explicit ArrayShape(LegionOrdered<nonnegative_int> const &);

   /**
    * @brief Alias of ArrayShape::num_elements for compatibility with
@@ -53,9 +53,6 @@ struct ArrayShape {
   ArrayShape sub_shape(std::optional<legion_dim_t> start,
                        std::optional<legion_dim_t> end) const;

-  bool operator==(ArrayShape const &) const;
-  bool operator!=(ArrayShape const &) const;
-
 public:
   LegionOrdered<nonnegative_int> dims;

diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
index 7b9b9c455c..afab2d00b6 100644
--- a/lib/kernels/include/kernels/legion_dim.h
+++ b/lib/kernels/include/kernels/legion_dim.h
@@ -10,6 +10,8 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);

 legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions);

+ff_dim_t ff_dim_from_legion_dim(legion_dim_t, nonnegative_int num_dimensions);
+
 template <typename T>
 using LegionOrdered = DimOrdered<legion_dim_t, T>;

diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
index cdc76371c8..114f817215 100644
--- a/lib/kernels/src/allocation.cc
+++ b/lib/kernels/src/allocation.cc
@@ -13,7 +13,6 @@ void Allocator::deallocate(void *ptr) {

 GenericTensorAccessorW
     Allocator::allocate_tensor(TensorShape const &tensor_shape) {
-  return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr};
   void *ptr =
       this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative());
   return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr};
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index fc35f47f3f..ea946b2882 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -22,7 +22,7 @@ ArrayShape::ArrayShape(TensorShape const &shape)
 ArrayShape::ArrayShape(std::vector<nonnegative_int> const &input_dims)
     : dims(input_dims) {}

-ArrayShape::ArrayShape(LegionTensorDims const &legion_tensor_dims)
+ArrayShape::ArrayShape(LegionOrdered<nonnegative_int> const &legion_tensor_dims)
     : dims(legion_tensor_dims) {}

 nonnegative_int ArrayShape::get_volume() const {
@@ -58,23 +58,23 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const {

 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  std::optional<legion_dim_t> legion_start =
+  return ArrayShape{legion_ordered_from_ff_ordered(slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
+                                 std::optional<legion_dim_t> end) const {
+  std::optional<ff_dim_t> legion_start =
       transform(start, [&](auto const &start_unwrapped) {
-        return legion_dim_from_ff_dim(start_unwrapped, num_dims());
+        return ff_dim_from_legion_dim(start_unwrapped, num_dims());
       });
-  std::optional<legion_dim_t> legion_end =
+  std::optional<ff_dim_t> legion_end =
       transform(end, [&](auto const &end_unwrapped) {
-        return legion_dim_from_ff_dim(end_unwrapped, num_dims());
+        return ff_dim_from_legion_dim(end_unwrapped, num_dims());
       });
   return this->sub_shape(legion_start, legion_end);
 }

-ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
-                                 std::optional<legion_dim_t> end) const {
-  return ArrayShape{slice(this->dims, start, end)};
-}
-
 bool ArrayShape::operator==(ArrayShape const &other) const {
   return this->tie() == other.tie();
 }
@@ -83,11 +83,11 @@ bool ArrayShape::operator!=(ArrayShape const &other) const {
   return this->tie() != other.tie();
 }

-ArrayShape ArrayShape::sub_shape(
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
-  NOT_IMPLEMENTED();
-}
+// ArrayShape ArrayShape::sub_shape(
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+//   NOT_IMPLEMENTED();
+// }

 std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
@@ -114,14 +114,6 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) {
           dtype};
 }

-bool ArrayShape::operator==(ArrayShape const &other) const {
-  return this->dims == other.dims;
-}
-
-bool ArrayShape::operator!=(ArrayShape const &other) const {
-  return this->dims != other.dims;
-}
-
 std::string format_as(ArrayShape const &x) {
   std::ostringstream oss;
   oss << "
diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
 namespace FlexFlow {
@@ -27,7 +28,7 @@ void calc_blk_size(size_t &num_blocks,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
   legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims());
-  blk_size = shape.sub_shape(legion_dim_t{0}, axis_legion_dim).num_elements().unwrap_nonnegative();
+  blk_size = shape.sub_shape(legion_dim_t{nonnegative_int{0}}, axis_legion_dim).num_elements().unwrap_nonnegative();
   num_blocks = shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
 }

diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc
index bbb15c5636..f89dd34d98 100644
--- a/lib/kernels/src/legion_dim.cc
+++ b/lib/kernels/src/legion_dim.cc
@@ -13,4 +13,10 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t,
 ff_dim.value.unwrap_nonnegative() - 1}};
 }

+ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim,
+                                nonnegative_int num_dimensions) {
+  return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() -
+                                  legion_dim.value.unwrap_nonnegative() - 1}};
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc
index a7647ebd18..bbe63bbed3 100644
--- a/lib/local-execution/src/local-execution/ops/transpose.cc
+++ b/lib/local-execution/src/local-execution/ops/transpose.cc
@@ -28,24 +28,8 @@ enum Slots {
   OUTPUT, // tensor
   ATTRS,
   PROFILING,
-  PER_DEVICE_STATE,
 };

-OpTaskInvocation init(TransposeAttrs const &attrs) {
-  OpTaskBinding binding;
-  binding.bind_arg(ATTRS, attrs);
-  return {task_id_t::TRANSPOSE_INIT_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  auto const &attrs = acc.get_argument<TransposeAttrs>(ATTRS);
-  std::vector<ff_dim_t> perm = inner_to_outer_idxs(attrs.perm);
-  TransposePerDeviceState per_device_state = init_kernel(perm.size(), perm);
-
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<TransposePerDeviceState>::create(per_device_state)};
-}

 OpTaskInvocation forward(TransposeAttrs const &attrs) {
   OpTaskBinding binding;
@@ -95,9 +79,6 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) {
   return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding};
 }

-TaskImplFunction get_transpose_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}

 TaskImplFunction get_transpose_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
@@ -107,13 +88,6 @@ TaskImplFunction get_transpose_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }

-OpTaskSignature get_transpose_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_arg_slot<TransposeAttrs>(ATTRS);
-  init.add_return_value<TransposePerDeviceState>();
-  return init;
-}

 OpTaskSignature get_transpose_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);
@@ -131,7 +105,7 @@ OpTaskSignature get_transpose_bwd_signature() {
 }

 std::vector<task_id_t> get_task_ids(TransposeAttrs const &) {
-  return {task_id_t::TRANSPOSE_INIT_TASK_ID, task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID};
+  return {task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID};
 }

 } // namespace FlexFlow
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
index ef01783eb7..12c8031654 100644
--- a/lib/local-execution/src/local_cost_estimator.cc
+++ b/lib/local-execution/src/local_cost_estimator.cc
@@ -19,7 +19,7 @@ namespace FlexFlow {
 LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config)
     : runtime_arg_config(config) {}

-static ComputationGraph const &
+static ComputationGraph
     create_computation_graph_for_local_cost_estimation(
         PCGOperatorAttrs const &op,
         std::vector const &inputs,
         std::vector const &weights,
         std::vector const &outputs) {
   ComputationGraph computation_graph = make_empty_computation_graph();

   // create layer for inputs
diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc
index 93a792c466..a7fe68c995 100644
--- a/lib/local-execution/src/loss_functions.cc
+++ b/lib/local-execution/src/loss_functions.cc
@@ -17,6 +17,7 @@
 #include "kernels/loss_function_kernels.h"
 #include "local-execution/loss_functions.h"
 #include "local-execution/profiling.h"
+#include "utils/nonnegative_int/nonnegative_int.h"

 namespace FlexFlow {

@@ -54,35 +55,35 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD);
   auto logit =
acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = logit.shape.at(legion_dim_t{1}); + int batch_size = logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit.shape.get_volume() == label.shape.get_volume()); - scale_factor = 2.0f / logit.shape.get_volume(); + scale_factor = 2.0f / logit.shape.get_volume().unwrap_nonnegative(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); - size_t ndim = logit.shape.num_dims(); - int num_classes = logit.shape.at(legion_dim_t{0}); + size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); + int num_classes = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); assert(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { - k = logit.shape.at(legion_dim_t(ndim - 1)) / + k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() / label.shape.at(legion_dim_t( - ndim - 1)); // TODO FIXME something seems wrong here, isn't the + nonnegative_int{ndim - 1})).unwrap_nonnegative(); // TODO FIXME something seems wrong here, isn't the // numerator guaranteed to be 1? <--- this is not the // case because of the potential parallel dim } - assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(1), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(ndim - 1)) == - logit.shape.at(legion_dim_t(ndim - 1))); - assert(label.shape.at(legion_dim_t(0)) == 1); + assert(label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); + assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() == + logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative()); + assert(label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, @@ -90,8 +91,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), batch_size, num_classes, k, @@ -99,7 +100,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { } else { assert(logit.shape == label.shape); assert(logit_grad.shape == logit.shape); - int num_channels = logit.shape.at(legion_dim_t{0}); + int num_channels = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, @@ -108,8 +109,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), scale_factor); break; } @@ -120,8 +121,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - 
get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), scale_factor); break; } @@ -131,15 +132,15 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), - get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), scale_factor); break; } default: - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Unsupported loss function {}. Please report this as an issue.", - loss_type); + loss_type)); } } } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 0c64147bd8..39c28fe83d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -59,11 +59,11 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto profiling = acc.get_argument(PROFILING); assert(weight.shape == weight_grad.shape); - size_t size = weight_grad.shape.get_volume(); + int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume() & weight.shape.get_volume() == 0); - size_t num_replicas = - weight_grad.shape.get_volume() / weight.shape.get_volume(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() & weight.shape.get_volume().unwrap_nonnegative() == 0); + int num_replicas = + weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); float *sgd_v_ptr; if (attrs.momentum > 0.0f) { @@ -153,11 +153,11 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto profiling = acc.get_argument(PROFILING); assert(weight.shape == weight_grad.shape); - size_t size = weight_grad.shape.get_volume(); + int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0); - size_t num_replicas = - weight_grad.shape.get_volume() / weight.shape.get_volume(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % weight.shape.get_volume().unwrap_nonnegative() == 0); + int num_replicas = + weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { auto handle = acc.get_argument(HANDLE); diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 9b7b55633c..6e63fc7a1e 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -36,8 +36,8 @@ void register_tasks_for_layer(TaskRegistry &task_registry, task_registry.backward_task_ids[op_id] = task_id; break; default: - throw mk_runtime_error("Invalid OpTaskType, got {}", - task_signature_impl.task_signature.type); + throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", + task_signature_impl.task_signature.type)); } task_registry.task_mapping.insert({task_id, task_signature_impl}); } @@ -58,7 +58,7 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, task_ids = task_registry.backward_task_ids; break; default: - throw mk_runtime_error("Invalid OpTaskType, got {}", op_task_type); + throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", op_task_type)); } return task_ids.at(op).has_value(); From 1dc1398458c6c330b8aade003e5e114464c9dc1f Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 5 
Feb 2025 17:25:16 -0800
Subject: [PATCH 37/91] Format

---
 lib/kernels/src/array_shape.cc             |  5 +--
 lib/kernels/src/cuda/ops/concat_kernels.cu |  7 ++--
 lib/kernels/src/legion_dim.cc              |  4 +--
 .../src/local-execution/ops/transpose.cc   |  3 --
 .../src/local_cost_estimator.cc            | 11 +++---
 lib/local-execution/src/loss_functions.cc  | 36 ++++++++++++-------
 lib/local-execution/src/optimizer.cc       | 15 ++++----
 lib/local-execution/src/task_registry.cc   |  8 +++--
 8 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index ea946b2882..220f8ebeea 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -1,8 +1,8 @@
 #include "kernels/array_shape.h"
 #include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
-#include "utils/containers/transform.h"
 #include "utils/containers/reversed.h"
+#include "utils/containers/transform.h"
 #include "utils/containers/vector_of.h"
 #include "utils/nonnegative_int/num_elements.h"

@@ -58,7 +58,8 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const {

 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  return ArrayShape{legion_ordered_from_ff_ordered(slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
+  return ArrayShape{legion_ordered_from_ff_ordered(
+      slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
 }

 ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
index 8bb7d996b6..0365764de1 100644
--- a/lib/kernels/src/cuda/ops/concat_kernels.cu
+++ b/lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -28,8 +28,11 @@ void calc_blk_size(size_t &num_blocks,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
   legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims());
-  blk_size = shape.sub_shape(legion_dim_t{nonnegative_int{0}}, axis_legion_dim).num_elements().unwrap_nonnegative();
-  num_blocks = shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
+  blk_size = shape.sub_shape(legion_dim_t{nonnegative_int{0}}, axis_legion_dim)
+                 .num_elements()
+                 .unwrap_nonnegative();
+  num_blocks =
+      shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
 }

 void forward_kernel(cudaStream_t stream,
diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc
index f89dd34d98..23875ad916 100644
--- a/lib/kernels/src/legion_dim.cc
+++ b/lib/kernels/src/legion_dim.cc
@@ -14,9 +14,9 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim,
 }

 ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim,
-                                  nonnegative_int num_dimensions) {
+                               nonnegative_int num_dimensions) {
   return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() -
-                                    legion_dim.value.unwrap_nonnegative() - 1}};
+                                  legion_dim.value.unwrap_nonnegative() - 1}};
 }

 } // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc
index bbe63bbed3..eafde9461e 100644
--- a/lib/local-execution/src/local-execution/ops/transpose.cc
+++ b/lib/local-execution/src/local-execution/ops/transpose.cc
@@ -30,7 +30,6 @@ enum Slots {
   PROFILING,
 };

-
 OpTaskInvocation forward(TransposeAttrs const &attrs) {
   OpTaskBinding binding;

@@ -79,7 +78,6 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) {
   return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding};
 }

-
 TaskImplFunction get_transpose_fwd_task_impl() {
   return
TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } @@ -88,7 +86,6 @@ TaskImplFunction get_transpose_bwd_task_impl() { return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } - OpTaskSignature get_transpose_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 12c8031654..85789c9505 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -19,12 +19,11 @@ namespace FlexFlow { LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) : runtime_arg_config(config) {} -static ComputationGraph - create_computation_graph_for_local_cost_estimation( - PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { +static ComputationGraph create_computation_graph_for_local_cost_estimation( + PCGOperatorAttrs const &op, + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs) { ComputationGraph computation_graph = make_empty_computation_graph(); // create layer for inputs diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index a7fe68c995..32b66629d3 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -55,7 +55,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); + int batch_size = + logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); @@ -69,21 +70,29 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + int num_classes = + logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); assert(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { - k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() / - label.shape.at(legion_dim_t( - nonnegative_int{ndim - 1})).unwrap_nonnegative(); // TODO FIXME something seems wrong here, isn't the - // numerator guaranteed to be 1? <--- this is not the - // case because of the potential parallel dim + k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative() / + label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative(); // TODO FIXME something seems wrong here, + // isn't the numerator guaranteed to be 1? 
+ // <--- this is not the case because of the + // potential parallel dim } - assert(label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() == - logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative()); - assert(label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == 1); + assert( + label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); + assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative() == + logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative()); + assert( + label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == + 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, @@ -100,7 +109,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { } else { assert(logit.shape == label.shape); assert(logit_grad.shape == logit.shape); - int num_channels = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + int num_channels = + logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 39c28fe83d..76da26433d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -61,9 +61,10 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { assert(weight.shape == weight_grad.shape); int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() % weight.shape.get_volume().unwrap_nonnegative() == 0); - int num_replicas = - weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % + weight.shape.get_volume().unwrap_nonnegative() == 0); + int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / + weight.shape.get_volume().unwrap_nonnegative(); float *sgd_v_ptr; if (attrs.momentum > 0.0f) { @@ -155,9 +156,11 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { assert(weight.shape == weight_grad.shape); int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() % weight.shape.get_volume().unwrap_nonnegative() == 0); - int num_replicas = - weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % + weight.shape.get_volume().unwrap_nonnegative() == + 0); + int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / + weight.shape.get_volume().unwrap_nonnegative(); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { auto handle = acc.get_argument(HANDLE); diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 6e63fc7a1e..7b0c80a9bc 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -36,8 +36,9 @@ void register_tasks_for_layer(TaskRegistry &task_registry, task_registry.backward_task_ids[op_id] = task_id; break; default: -
throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", - task_signature_impl.task_signature.type)); + throw mk_runtime_error( + fmt::format("Invalid OpTaskType, got {}", + task_signature_impl.task_signature.type)); } task_registry.task_mapping.insert({task_id, task_signature_impl}); } @@ -58,7 +59,8 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, task_ids = task_registry.backward_task_ids; break; default: - throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", op_task_type)); + throw mk_runtime_error( + fmt::format("Invalid OpTaskType, got {}", op_task_type)); } return task_ids.at(op).has_value(); From 17ad5c8855adf788146be53049a151a9785d84b1 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 5 Feb 2025 18:09:40 -0800 Subject: [PATCH 38/91] Split task spec files --- .proj.toml | 1 + lib/CMakeLists.txt | 1 + lib/local-execution/CMakeLists.txt | 1 + .../generic_task_impl_function.h | 2 +- .../local-execution/gradient_tensor_source.h | 2 +- .../init_op_task_impl_function.h | 2 +- .../local-execution/itask_argument_accessor.h | 6 ++--- .../local-execution/local_args_backing.h | 8 +++---- .../local-execution/local_cost_estimator.h | 2 +- .../local_task_argument_accessor.h | 2 +- .../local-execution/local_tensor_backing.h | 12 +++++----- .../include/local-execution/loss_functions.h | 6 ++--- .../local-execution/loss_tensor_source.h | 2 +- .../local-execution/lowered_tensor_source.h | 2 +- .../local-execution/model_training_instance.h | 2 +- .../include/local-execution/ops/attention.h | 4 ++-- .../local-execution/ops/batch_matmul.h | 6 ++--- .../include/local-execution/ops/batch_norm.h | 4 ++-- .../include/local-execution/ops/cast.h | 4 ++-- .../include/local-execution/ops/combine.h | 4 ++-- .../include/local-execution/ops/concat.h | 4 ++-- .../include/local-execution/ops/conv_2d.h | 4 ++-- .../include/local-execution/ops/dropout.h | 6 ++--- .../local-execution/ops/element_binary.h | 2 +- .../local-execution/ops/element_unary.h | 4 ++-- .../include/local-execution/ops/embedding.h | 4 ++-- .../include/local-execution/ops/flat.h | 3 ++- .../include/local-execution/ops/gather.h | 4 ++-- .../include/local-execution/ops/input.h | 2 +- .../include/local-execution/ops/layer_norm.h | 4 ++-- .../include/local-execution/ops/linear.h | 4 ++-- .../include/local-execution/ops/noop.h | 2 +- .../include/local-execution/ops/pool_2d.h | 4 ++-- .../include/local-execution/ops/reduce.h | 4 ++-- .../include/local-execution/ops/reduction.h | 4 ++-- .../include/local-execution/ops/repartition.h | 4 ++-- .../include/local-execution/ops/replicate.h | 4 ++-- .../include/local-execution/ops/reshape.h | 4 ++-- .../include/local-execution/ops/reverse.h | 4 ++-- .../include/local-execution/ops/softmax.h | 4 ++-- .../include/local-execution/ops/split.h | 4 ++-- .../include/local-execution/ops/topk.h | 4 ++-- .../include/local-execution/ops/transpose.h | 4 ++-- .../include/local-execution/ops/weight.h | 2 +- .../include/local-execution/optimizer.h | 4 ++-- .../local-execution/optimizer_tensor_source.h | 2 +- .../include/local-execution/sim_environment.h | 2 +- .../local-execution/task_argument_accessor.h | 4 ++-- .../include/local-execution/task_registry.h | 2 +- .../local-execution/task_registry.struct.toml | 2 +- .../local-execution/task_signature_impl.h | 4 ++-- .../task_signature_impl.struct.toml | 2 +- .../include/local-execution/tasks.h | 2 +- .../include/local-execution/tensor_lowering.h | 13 ----------- .../src/local-execution/ops/attention.cc | 2 +- 
.../src/local-execution/ops/batch_matmul.cc | 2 +- .../src/local-execution/ops/cast.cc | 2 +- .../src/local-execution/ops/combine.cc | 2 +- .../src/local-execution/ops/concat.cc | 4 ++-- .../src/local-execution/ops/dropout.cc | 4 ++-- lib/local-execution/src/local_args_backing.cc | 2 +- .../src/local_cost_estimator.cc | 2 +- .../src/local_tensor_backing.cc | 4 ++-- .../src/local_training_backing.cc | 6 ++--- lib/local-execution/src/loss_functions.cc | 2 +- lib/local-execution/src/optimizer.cc | 2 +- lib/local-execution/src/per_device_state.cc | 2 +- lib/local-execution/src/task_binding.cc | 2 +- lib/local-execution/src/tensor_lowering.cc | 10 --------- .../test/src/test_local_slots_backing.cc | 2 +- lib/local-execution/test/src/test_loss_e2e.cc | 2 +- .../test/src/test_update_e2e.cc | 2 +- lib/task-spec/CMakeLists.txt | 16 ++++++++++++++ .../include/task-spec}/arg_ref.h | 2 +- .../include/task-spec}/concrete_arg.h | 2 +- .../include/task-spec}/config.h | 0 .../include/task-spec}/device_specific.h | 2 +- ...device_specific_device_states.variant.toml | 2 +- .../task-spec}/gradient_tensor_t.struct.toml | 0 .../include/task-spec}/is_grad.enum.toml | 0 .../include/task-spec}/is_trainable.enum.toml | 0 .../task-spec}/loss_tensor_t.struct.toml | 0 .../task-spec}/lowered_tensor_t.struct.toml | 0 .../include/task-spec}/op_arg_ref.h | 8 +++---- .../task-spec}/op_arg_ref_type.variant.toml | 4 ++-- .../include/task-spec}/op_arg_spec.h | 2 +- .../task-spec}/op_arg_spec.variant.toml | 6 ++--- .../task-spec}/op_slot_options.enum.toml | 0 .../include/task-spec}/op_task_invocation.h | 22 +++++++++---------- .../include/task-spec}/op_task_signature.h | 14 ++++++------ .../task-spec}/op_task_to_task_invocation.h | 8 +++---- .../include/task-spec}/op_task_type.enum.toml | 0 .../op_tensor_slot_spec.struct.toml | 10 ++++----- .../include/task-spec}/op_tensor_spec.h | 2 +- .../task-spec}/optimizer_tensor_t.struct.toml | 0 ...parallel_tensor_shape_ref_type.struct.toml | 0 .../include/task-spec}/per_device_op_state.h | 4 ++-- .../per_device_op_state.variant.toml | 0 .../per_device_op_state_ref_type.struct.toml | 0 .../include/task-spec}/profiling.h | 0 .../include/task-spec}/runtime_arg_config.h | 4 ++-- .../include/task-spec}/runtime_arg_ref.h | 8 +++---- .../include/task-spec}/serialization.h | 0 .../task-spec}/slot_grad_id.struct.toml | 4 ++-- .../include/task-spec}/slot_id_t.struct.toml | 0 .../slot_tensor_type_id.struct.toml | 4 ++-- .../include/task-spec}/slot_type.enum.toml | 0 .../task-spec}/task_arg_spec.variant.toml | 4 ++-- .../include/task-spec}/task_binding.h | 18 +++++++-------- .../include/task-spec}/task_id_t.enum.toml | 0 .../include/task-spec}/task_invocation.h | 2 +- .../task-spec}/task_invocation.struct.toml | 4 ++-- .../include/task-spec}/task_signature.h | 2 +- .../task-spec}/task_signature.struct.toml | 4 ++-- .../include/task-spec}/tensor_role.enum.toml | 0 .../include/task-spec}/tensor_type.enum.toml | 0 .../tensor_type_slot_spec.struct.toml | 6 ++--- .../task-spec}/tensor_type_t.variant.toml | 6 ++--- .../include/task-spec}/variadic_tensor_ref.h | 4 ++-- .../src/concrete_arg.cc | 2 +- .../src/op_arg_ref.cc | 2 +- .../src/op_arg_spec.cc | 2 +- .../src/op_task_invocation.cc | 4 ++-- .../src/op_task_signature.cc | 2 +- .../src/op_task_to_task_invocation.cc | 2 +- .../src/op_tensor_spec.cc | 2 +- .../src/runtime_arg_ref.cc | 4 ++-- .../src/task_invocation.cc | 2 +- .../src/task_signature.cc | 2 +- .../src/variadic_tensor_ref.cc | 2 +- 130 files changed, 225 insertions(+), 228 deletions(-) 
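The stat above records a newly created lib/task-spec/CMakeLists.txt (16 lines); its diff body appears further down in this patch. For orientation, a minimal sketch of what such a file could look like, assuming it follows the same ff_add_library(...) convention seen in the lib/local-execution/CMakeLists.txt hunk below; the keyword arguments and dependency list here are guesses, not the actual contents:

    # Hypothetical sketch only -- the NAME, SRC_PATTERNS, and DEPS values are
    # assumptions modeled on the ff_add_library call in lib/local-execution,
    # not the real 16-line file added by this patch.
    ff_add_library(
      NAME
        task-spec
      SRC_PATTERNS
        src/*.cc
      PUBLIC_INCLUDE
        include/
      DEPS
        op-attrs
        pcg
        utils
    )
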
delete mode 100644 lib/local-execution/include/local-execution/tensor_lowering.h delete mode 100644 lib/local-execution/src/tensor_lowering.cc create mode 100644 lib/task-spec/CMakeLists.txt rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/arg_ref.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/concrete_arg.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/config.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/device_specific.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/device_specific_device_states.variant.toml (98%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/gradient_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/is_grad.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/is_trainable.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/loss_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/lowered_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_ref.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_ref_type.variant.toml (73%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_spec.h (85%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_spec.variant.toml (76%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_slot_options.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_invocation.h (86%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_signature.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_to_task_invocation.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_type.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_tensor_slot_spec.struct.toml (68%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_tensor_spec.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/optimizer_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/parallel_tensor_shape_ref_type.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/per_device_op_state.h (71%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/per_device_op_state.variant.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/per_device_op_state_ref_type.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/profiling.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/runtime_arg_config.h (80%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/runtime_arg_ref.h (81%) rename 
lib/{local-execution/include/local-execution => task-spec/include/task-spec}/serialization.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_grad_id.struct.toml (75%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_id_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_tensor_type_id.struct.toml (76%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_type.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_arg_spec.variant.toml (77%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_binding.h (82%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_id_t.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_invocation.h (81%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_invocation.struct.toml (75%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature.struct.toml (86%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_role.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_type.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_type_slot_spec.struct.toml (72%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_type_t.variant.toml (76%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/variadic_tensor_ref.h (81%) rename lib/{local-execution => task-spec}/src/concrete_arg.cc (94%) rename lib/{local-execution => task-spec}/src/op_arg_ref.cc (87%) rename lib/{local-execution => task-spec}/src/op_arg_spec.cc (83%) rename lib/{local-execution => task-spec}/src/op_task_invocation.cc (97%) rename lib/{local-execution => task-spec}/src/op_task_signature.cc (99%) rename lib/{local-execution => task-spec}/src/op_task_to_task_invocation.cc (98%) rename lib/{local-execution => task-spec}/src/op_tensor_spec.cc (89%) rename lib/{local-execution => task-spec}/src/runtime_arg_ref.cc (89%) rename lib/{local-execution => task-spec}/src/task_invocation.cc (77%) rename lib/{local-execution => task-spec}/src/task_signature.cc (93%) rename lib/{local-execution => task-spec}/src/variadic_tensor_ref.cc (75%) diff --git a/.proj.toml b/.proj.toml index 10307a6efa..94c2510671 100644 --- a/.proj.toml +++ b/.proj.toml @@ -12,6 +12,7 @@ build_targets = [ "compiler", "substitution-generator", "local-execution", + "task-spec", "models", "export-model-arch", "substitution-to-dot", diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 972c656126..e2e561c384 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(runtime) add_subdirectory(op-attrs) add_subdirectory(kernels) add_subdirectory(local-execution) +add_subdirectory(task-spec) add_subdirectory(utils) add_subdirectory(ffi) add_subdirectory(substitutions) diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index f649f86ce3..db0cf7603f 100644 --- a/lib/local-execution/CMakeLists.txt +++ 
b/lib/local-execution/CMakeLists.txt @@ -11,6 +11,7 @@ ff_add_library( op-attrs utils kernels + task-spec pcg spdlog ) diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/local-execution/include/local-execution/generic_task_impl_function.h index 425740f61d..9ce22ecf54 100644 --- a/lib/local-execution/include/local-execution/generic_task_impl_function.h +++ b/lib/local-execution/include/local-execution/generic_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H -#include "local-execution/device_specific_device_states.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h index bb7a4c7aa8..e7d24d1ca5 100644 --- a/lib/local-execution/include/local-execution/gradient_tensor_source.h +++ b/lib/local-execution/include/local-execution/gradient_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H -#include "local-execution/gradient_tensor_t.dtg.h" +#include "task-spec/gradient_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/local-execution/include/local-execution/init_op_task_impl_function.h index 7b23a2bc64..0481e31a5f 100644 --- a/lib/local-execution/include/local-execution/init_op_task_impl_function.h +++ b/lib/local-execution/include/local-execution/init_op_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#include "local-execution/device_specific_device_states.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/local-execution/include/local-execution/itask_argument_accessor.h index 9eff9460c2..24b3b3a37f 100644 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ b/lib/local-execution/include/local-execution/itask_argument_accessor.h @@ -2,10 +2,10 @@ #define _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "local-execution/concrete_arg.h" -#include "local-execution/op_task_signature.h" #include "local-execution/privilege_tensor_accessor.h" -#include "local-execution/tensor_type.dtg.h" +#include "task-spec/concrete_arg.h" +#include "task-spec/op_task_signature.h" +#include "task-spec/tensor_type.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h index 6e6839fea7..4c9ede54fd 100644 --- a/lib/local-execution/include/local-execution/local_args_backing.h +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -2,12 +2,12 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/per_device_op_state.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" #include 
"pcg/computation_graph.h" #include "pcg/layer_guid_t.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/per_device_op_state.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/local_cost_estimator.h b/lib/local-execution/include/local-execution/local_cost_estimator.h index 350d8f5abd..0189475fcb 100644 --- a/lib/local-execution/include/local-execution/local_cost_estimator.h +++ b/lib/local-execution/include/local-execution/local_cost_estimator.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_COST_ESTIMATOR_H #include "local-execution/cost_estimate.h" -#include "local-execution/runtime_arg_config.h" +#include "task-spec/runtime_arg_config.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index db0e98c2b1..b1e5a02985 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "task-spec/slot_tensor_type_id.dtg.h" #include #include diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 825ff0553e..9d35373784 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -6,19 +6,19 @@ #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/loss_tensor_source.h" -#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/lowered_tensor_source.h" -#include "local-execution/lowered_tensor_t.dtg.h" #include "local-execution/optimizer_tensor_source.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "local-execution/tensor_type_t.dtg.h" #include "op-attrs/tensor_shape.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" +#include "task-spec/optimizer_tensor_t.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/tensor_role.dtg.h" +#include "task-spec/tensor_type_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index b2a6d610c3..c06908503a 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,12 +16,12 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/task_signature.h" #include 
"op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h index 2b55f1af01..d9858cde40 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H -#include "local-execution/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h index e4fc4ff56c..bd0b90dd75 100644 --- a/lib/local-execution/include/local-execution/lowered_tensor_source.h +++ b/lib/local-execution/include/local-execution/lowered_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H -#include "local-execution/lowered_tensor_t.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index bf0fc1a3c0..c264418abc 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H #include "local-execution/local_training_backing.h" -#include "local-execution/loss_tensor_t.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/local-execution/include/local-execution/ops/attention.h index 96f5aadcd9..bf5385f609 100644 --- a/lib/local-execution/include/local-execution/ops/attention.h +++ b/lib/local-execution/include/local-execution/ops/attention.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_ATTENTION_H #define _FLEXFLOW_ATTENTION_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/attention.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/batch_matmul.h b/lib/local-execution/include/local-execution/ops/batch_matmul.h index 23389d5083..64d220ab66 100644 --- a/lib/local-execution/include/local-execution/ops/batch_matmul.h +++ b/lib/local-execution/include/local-execution/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_BATCH_MATMUL_H #define _FLEXFLOW_BATCH_MATMUL_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/op_task_signature.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/batch_matmul_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/batch_norm.h b/lib/local-execution/include/local-execution/ops/batch_norm.h index 
36aa8ffa4e..85a7190ce1 100644 --- a/lib/local-execution/include/local-execution/ops/batch_norm.h +++ b/lib/local-execution/include/local-execution/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/cast.h b/lib/local-execution/include/local-execution/ops/cast.h index e7af6aca6b..6a27ad267a 100644 --- a/lib/local-execution/include/local-execution/ops/cast.h +++ b/lib/local-execution/include/local-execution/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/combine.h b/lib/local-execution/include/local-execution/ops/combine.h index e85e8fba39..00e9cbed2c 100644 --- a/lib/local-execution/include/local-execution/ops/combine.h +++ b/lib/local-execution/include/local-execution/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/combine_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/concat.h b/lib/local-execution/include/local-execution/ops/concat.h index eab70d621c..c46164e417 100644 --- a/lib/local-execution/include/local-execution/ops/concat.h +++ b/lib/local-execution/include/local-execution/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/concat_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/conv_2d.h b/lib/local-execution/include/local-execution/ops/conv_2d.h index 0358d71eea..f3bb34ffeb 100644 --- a/lib/local-execution/include/local-execution/ops/conv_2d.h +++ b/lib/local-execution/include/local-execution/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/conv_2d_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/dropout.h b/lib/local-execution/include/local-execution/ops/dropout.h index a3dc5ff8af..bd7b426c6b 100644 --- a/lib/local-execution/include/local-execution/ops/dropout.h +++ b/lib/local-execution/include/local-execution/ops/dropout.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" -#include "local-execution/task_id_t.dtg.h" +#include "local-execution/task_impl_function.dtg.h" #include 
"op-attrs/ops/dropout_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/task_id_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/element_binary.h b/lib/local-execution/include/local-execution/ops/element_binary.h index 72c0976df8..4e0bb46e47 100644 --- a/lib/local-execution/include/local-execution/ops/element_binary.h +++ b/lib/local-execution/include/local-execution/ops/element_binary.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_ELEMENT_BINARY_H #define _FLEXFLOW_ELEMENT_BINARY_H -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_signature_impl.h" #include "op-attrs/ops/element_binary_attrs.dtg.h" diff --git a/lib/local-execution/include/local-execution/ops/element_unary.h b/lib/local-execution/include/local-execution/ops/element_unary.h index 04a72e2e12..9900668d6c 100644 --- a/lib/local-execution/include/local-execution/ops/element_unary.h +++ b/lib/local-execution/include/local-execution/ops/element_unary.h @@ -1,9 +1,9 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/element_unary_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/embedding.h b/lib/local-execution/include/local-execution/ops/embedding.h index 995d2296e1..b998aef53e 100644 --- a/lib/local-execution/include/local-execution/ops/embedding.h +++ b/lib/local-execution/include/local-execution/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/embedding_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/flat.h b/lib/local-execution/include/local-execution/ops/flat.h index e019bfc654..95afb98340 100644 --- a/lib/local-execution/include/local-execution/ops/flat.h +++ b/lib/local-execution/include/local-execution/ops/flat.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/flat_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/gather.h b/lib/local-execution/include/local-execution/ops/gather.h index e339683381..5569a94728 100644 --- a/lib/local-execution/include/local-execution/ops/gather.h +++ b/lib/local-execution/include/local-execution/ops/gather.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/gather_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/input.h b/lib/local-execution/include/local-execution/ops/input.h index baad25b798..9181478363 100644 --- a/lib/local-execution/include/local-execution/ops/input.h +++ b/lib/local-execution/include/local-execution/ops/input.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_INPUT_H #define _FLEXFLOW_INPUT_H 
-#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/input_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/layer_norm.h b/lib/local-execution/include/local-execution/ops/layer_norm.h index 8e034ac519..e4a15caac2 100644 --- a/lib/local-execution/include/local-execution/ops/layer_norm.h +++ b/lib/local-execution/include/local-execution/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/layer_norm_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/linear.h b/lib/local-execution/include/local-execution/ops/linear.h index 2aaf13a95a..d58d876865 100644 --- a/lib/local-execution/include/local-execution/ops/linear.h +++ b/lib/local-execution/include/local-execution/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/linear_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/noop.h b/lib/local-execution/include/local-execution/ops/noop.h index 1097adeb5e..adbc15cd3b 100644 --- a/lib/local-execution/include/local-execution/ops/noop.h +++ b/lib/local-execution/include/local-execution/ops/noop.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_NOOP_H #define _FLEXFLOW_NOOP_H -#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/noop_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/pool_2d.h b/lib/local-execution/include/local-execution/ops/pool_2d.h index 908fd5462f..7d0ec44bd7 100644 --- a/lib/local-execution/include/local-execution/ops/pool_2d.h +++ b/lib/local-execution/include/local-execution/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/pool_2d_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reduce.h b/lib/local-execution/include/local-execution/ops/reduce.h index 7900c28159..5c6d4be338 100644 --- a/lib/local-execution/include/local-execution/ops/reduce.h +++ b/lib/local-execution/include/local-execution/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reduce_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reduction.h b/lib/local-execution/include/local-execution/ops/reduction.h index 56833602e6..7475d3aeb4 100644 --- a/lib/local-execution/include/local-execution/ops/reduction.h +++ b/lib/local-execution/include/local-execution/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef 
_FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reduction_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/repartition.h b/lib/local-execution/include/local-execution/ops/repartition.h index 5187d04ca0..08ecdafcf2 100644 --- a/lib/local-execution/include/local-execution/ops/repartition.h +++ b/lib/local-execution/include/local-execution/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/repartition_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/replicate.h b/lib/local-execution/include/local-execution/ops/replicate.h index 85d1dff41a..b827b9c272 100644 --- a/lib/local-execution/include/local-execution/ops/replicate.h +++ b/lib/local-execution/include/local-execution/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/replicate_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reshape.h b/lib/local-execution/include/local-execution/ops/reshape.h index 37f07534ee..ed7e6e9e31 100644 --- a/lib/local-execution/include/local-execution/ops/reshape.h +++ b/lib/local-execution/include/local-execution/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reshape_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reverse.h b/lib/local-execution/include/local-execution/ops/reverse.h index 7c16073be7..dd0e89ecad 100644 --- a/lib/local-execution/include/local-execution/ops/reverse.h +++ b/lib/local-execution/include/local-execution/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/softmax.h b/lib/local-execution/include/local-execution/ops/softmax.h index d440fe7239..294d948b42 100644 --- a/lib/local-execution/include/local-execution/ops/softmax.h +++ b/lib/local-execution/include/local-execution/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/softmax_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/split.h 
b/lib/local-execution/include/local-execution/ops/split.h index dde46c20bf..49cd7cfc7b 100644 --- a/lib/local-execution/include/local-execution/ops/split.h +++ b/lib/local-execution/include/local-execution/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/split_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/topk.h b/lib/local-execution/include/local-execution/ops/topk.h index c8f3175ebd..aeded512cd 100644 --- a/lib/local-execution/include/local-execution/ops/topk.h +++ b/lib/local-execution/include/local-execution/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/topk_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/transpose.h b/lib/local-execution/include/local-execution/ops/transpose.h index f2ce014aa7..2c7b5fb3bc 100644 --- a/lib/local-execution/include/local-execution/ops/transpose.h +++ b/lib/local-execution/include/local-execution/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/transpose_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/weight.h b/lib/local-execution/include/local-execution/ops/weight.h index e59a88f07d..162236e41e 100644 --- a/lib/local-execution/include/local-execution/ops/weight.h +++ b/lib/local-execution/include/local-execution/ops/weight.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_WEIGHT_H #define _FLEXFLOW_WEIGHT_H -#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/weight_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 3a092e34c6..f6bd5a3ee9 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -2,11 +2,11 @@ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/task_signature.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h index 658c545225..7a5057c84a 100644 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H 
-#include "local-execution/optimizer_tensor_t.dtg.h" +#include "task-spec/optimizer_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h index 7c81cba408..6c2f8d4ebb 100644 --- a/lib/local-execution/include/local-execution/sim_environment.h +++ b/lib/local-execution/include/local-execution/sim_environment.h @@ -4,11 +4,11 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" #include "local-execution/cost_metrics.h" -#include "local-execution/op_task_invocation.h" #include "local-execution/task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "op-attrs/parallel_tensor_shape.dtg.h" #include "pcg/machine_view.h" +#include "task-spec/op_task_invocation.h" #include namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 0cbeaf04c8..99c1c1296b 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" +#include "task-spec/device_specific.h" +#include "task-spec/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index cb717ca2af..22cc344b3d 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -2,10 +2,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H -#include "local-execution/op_task_type.dtg.h" #include "local-execution/task_registry.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" #include "pcg/computation_graph.dtg.h" +#include "task-spec/op_task_type.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml index ada467a67d..c3784b617f 100644 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ b/lib/local-execution/include/local-execution/task_registry.struct.toml @@ -8,7 +8,7 @@ features = [ includes = [ "local-execution/task_signature_impl.dtg.h", - "local-execution/task_id_t.dtg.h", + "task-spec/task_id_t.dtg.h", "pcg/layer_guid_t.dtg.h", ] diff --git a/lib/local-execution/include/local-execution/task_signature_impl.h b/lib/local-execution/include/local-execution/task_signature_impl.h index 98c5c0cb3b..613a173f25 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.h +++ b/lib/local-execution/include/local-execution/task_signature_impl.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature_impl.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/task_id_t.dtg.h" namespace FlexFlow { diff --git 
a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml b/lib/local-execution/include/local-execution/task_signature_impl.struct.toml index 981794503b..78064203ec 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature_impl.struct.toml @@ -8,7 +8,7 @@ features = [ includes = [ "local-execution/task_impl_function.dtg.h", - "local-execution/op_task_signature.h", + "task-spec/op_task_signature.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/tasks.h b/lib/local-execution/include/local-execution/tasks.h index 4f5b26c43b..aae3b3fe44 100644 --- a/lib/local-execution/include/local-execution/tasks.h +++ b/lib/local-execution/include/local-execution/tasks.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H #define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H -#include "local-execution/task_id_t.dtg.h" +#include "task-spec/task_id_t.dtg.h" #include #include #include diff --git a/lib/local-execution/include/local-execution/tensor_lowering.h b/lib/local-execution/include/local-execution/tensor_lowering.h deleted file mode 100644 index 5f3870c1d2..0000000000 --- a/lib/local-execution/include/local-execution/tensor_lowering.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H - -#include "local-execution/lowered_tensor_t.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" - -namespace FlexFlow { - -lowered_tensor_t lower(tensor_guid_t const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/src/local-execution/ops/attention.cc b/lib/local-execution/src/local-execution/ops/attention.cc index 6401d5beac..a9e6a9fa30 100644 --- a/lib/local-execution/src/local-execution/ops/attention.cc +++ b/lib/local-execution/src/local-execution/ops/attention.cc @@ -15,9 +15,9 @@ #include "local-execution/ops/attention.h" #include "kernels/attention_kernels.h" -#include "local-execution/op_task_signature.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" +#include "task-spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/batch_matmul.cc b/lib/local-execution/src/local-execution/ops/batch_matmul.cc index cd22cee283..c780ab6eca 100644 --- a/lib/local-execution/src/local-execution/ops/batch_matmul.cc +++ b/lib/local-execution/src/local-execution/ops/batch_matmul.cc @@ -15,9 +15,9 @@ #include "local-execution/ops/batch_matmul.h" #include "kernels/batch_matmul_kernels.h" -#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" +#include "task-spec/op_task_signature.h" #include "utils/containers/transform.h" #include "utils/nonnegative_int/nonnegative_range.h" diff --git a/lib/local-execution/src/local-execution/ops/cast.cc b/lib/local-execution/src/local-execution/ops/cast.cc index 846faa9262..e5dd7f9c4e 100644 --- a/lib/local-execution/src/local-execution/ops/cast.cc +++ b/lib/local-execution/src/local-execution/ops/cast.cc @@ -16,7 +16,7 @@ #include "local-execution/ops/cast.h" #include "kernels/cast_kernels.h" -#include "local-execution/op_task_signature.h" +#include "task-spec/op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; diff --git a/lib/local-execution/src/local-execution/ops/combine.cc b/lib/local-execution/src/local-execution/ops/combine.cc index 
b7e84878f4..32fab636d3 100644 --- a/lib/local-execution/src/local-execution/ops/combine.cc +++ b/lib/local-execution/src/local-execution/ops/combine.cc @@ -15,7 +15,7 @@ #include "local-execution/ops/combine.h" #include "kernels/combine_kernels.h" -#include "local-execution/op_task_invocation.h" +#include "task-spec/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/concat.cc b/lib/local-execution/src/local-execution/ops/concat.cc index dee1dd08e5..51370e7a4d 100644 --- a/lib/local-execution/src/local-execution/ops/concat.cc +++ b/lib/local-execution/src/local-execution/ops/concat.cc @@ -16,9 +16,9 @@ #include "local-execution/ops/concat.h" #include "kernels/concat_kernels.h" -#include "local-execution/op_task_signature.h" -#include "local-execution/variadic_tensor_ref.h" #include "op-attrs/get_output_shapes.h" +#include "task-spec/op_task_signature.h" +#include "task-spec/variadic_tensor_ref.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/dropout.cc b/lib/local-execution/src/local-execution/ops/dropout.cc index 017d023ec4..cef1ea6c93 100644 --- a/lib/local-execution/src/local-execution/ops/dropout.cc +++ b/lib/local-execution/src/local-execution/ops/dropout.cc @@ -1,8 +1,8 @@ #include "local-execution/ops/dropout.h" #include "kernels/dropout_kernels.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/op_task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc index d8a94fb2c5..715a96efa6 100644 --- a/lib/local-execution/src/local_args_backing.cc +++ b/lib/local-execution/src/local_args_backing.cc @@ -1,6 +1,6 @@ #include "local-execution/local_args_backing.h" -#include "local-execution/op_task_to_task_invocation.h" #include "op-attrs/parallel_tensor_shape.h" +#include "task-spec/op_task_to_task_invocation.h" #include "utils/containers/contains_key.h" #include "utils/containers/map_values.h" #include "utils/overload.h" diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 85789c9505..31418c6bea 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,7 +1,7 @@ #include "local-execution/local_cost_estimator.h" #include "kernels/device.h" #include "kernels/local_cuda_allocator.h" -#include "local-execution/tensor_lowering.h" + #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index de058d88ad..00c170d501 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,6 +1,6 @@ #include "local-execution/local_tensor_backing.h" -#include "local-execution/slot_grad_id.dtg.h" -#include "local-execution/tensor_lowering.h" +#include "task-spec/slot_grad_id.dtg.h" + #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 144596820a..2679a502e3 
100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,10 +1,10 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" -#include "local-execution/op_task_to_task_invocation.h" #include "local-execution/optimizer.h" -#include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" -#include "local-execution/tensor_lowering.h" +#include "task-spec/op_task_to_task_invocation.h" +#include "task-spec/task_invocation.h" + #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 32b66629d3..15ebdd5f28 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -16,7 +16,7 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" #include "local-execution/loss_functions.h" -#include "local-execution/profiling.h" +#include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 76da26433d..a69ae9da61 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -1,6 +1,6 @@ #include "local-execution/optimizer.h" #include "kernels/optimizer_kernels.h" -#include "local-execution/profiling.h" +#include "task-spec/profiling.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_state.cc index fa470b196d..a959f4a8c9 100644 --- a/lib/local-execution/src/per_device_state.cc +++ b/lib/local-execution/src/per_device_state.cc @@ -1,4 +1,4 @@ -#include "local-execution/per_device_op_state.h" +#include "task-spec/per_device_op_state.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 7684511488..4537493c1d 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,4 +1,4 @@ -#include "local-execution/task_binding.h" +#include "task-spec/task_binding.h" #include "pcg/tensor_guid_t.dtg.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" diff --git a/lib/local-execution/src/tensor_lowering.cc b/lib/local-execution/src/tensor_lowering.cc deleted file mode 100644 index 63be366d94..0000000000 --- a/lib/local-execution/src/tensor_lowering.cc +++ /dev/null @@ -1,10 +0,0 @@ -#include "local-execution/tensor_lowering.h" -#include "utils/containers/transform.h" - -namespace FlexFlow { - -lowered_tensor_t lower(tensor_guid_t const &tensor_guid) { - return lowered_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 7568265446..e5ca58bc1f 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -2,7 +2,7 @@ #include "local-execution/local_cost_estimator.h" #include "local-execution/local_cpu_allocator.h" #include "local-execution/local_tensor_backing.h" -#include "local-execution/tensor_lowering.h" + #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" #include 
"pcg/computation_graph.h" diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 210cd1af83..62778c2e32 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -3,7 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" -#include "local-execution/tensor_lowering.h" + #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index d16c5e5b0b..4658a2a544 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -3,7 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" -#include "local-execution/tensor_lowering.h" + #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" diff --git a/lib/task-spec/CMakeLists.txt b/lib/task-spec/CMakeLists.txt new file mode 100644 index 0000000000..8deb20a593 --- /dev/null +++ b/lib/task-spec/CMakeLists.txt @@ -0,0 +1,16 @@ +ff_add_library( + NAME + task-spec + SRC_PATTERNS + src/*.cc + PUBLIC_INCLUDE + include/ + PRIVATE_INCLUDE + src/ + DEPS + op-attrs + utils + kernels + pcg + spdlog +) diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/task-spec/include/task-spec/arg_ref.h similarity index 97% rename from lib/local-execution/include/local-execution/arg_ref.h rename to lib/task-spec/include/task-spec/arg_ref.h index 75eecda273..8d3402c578 100644 --- a/lib/local-execution/include/local-execution/arg_ref.h +++ b/lib/task-spec/include/task-spec/arg_ref.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -// #include "local-execution/serialization.h +// #include "task-spec/serialization.h #include "utils/type_index.h" #include "utils/visitable.h" diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/task-spec/include/task-spec/concrete_arg.h similarity index 97% rename from lib/local-execution/include/local-execution/concrete_arg.h rename to lib/task-spec/include/task-spec/concrete_arg.h index cee52ba4a2..7b2ece59a7 100644 --- a/lib/local-execution/include/local-execution/concrete_arg.h +++ b/lib/task-spec/include/task-spec/concrete_arg.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H #include "fmt/format.h" -#include "local-execution/serialization.h" +#include "task-spec/serialization.h" #include "utils/hash-utils.h" #include "utils/type_index.h" #include diff --git a/lib/local-execution/include/local-execution/config.h b/lib/task-spec/include/task-spec/config.h similarity index 100% rename from lib/local-execution/include/local-execution/config.h rename to lib/task-spec/include/task-spec/config.h diff --git a/lib/local-execution/include/local-execution/device_specific.h b/lib/task-spec/include/task-spec/device_specific.h similarity index 97% rename from lib/local-execution/include/local-execution/device_specific.h rename to lib/task-spec/include/task-spec/device_specific.h index 4035aaf7cf..3ef017f704 100644 --- a/lib/local-execution/include/local-execution/device_specific.h +++ b/lib/task-spec/include/task-spec/device_specific.h @@ -1,7 +1,7 @@ #ifndef 
_FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H #define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H -#include "local-execution/serialization.h" +#include "task-spec/serialization.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml similarity index 98% rename from lib/local-execution/include/local-execution/device_specific_device_states.variant.toml rename to lib/task-spec/include/task-spec/device_specific_device_states.variant.toml index db476e771d..944dddc3df 100644 --- a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml +++ b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml @@ -22,7 +22,7 @@ includes = [ "kernels/softmax_kernels.h", "kernels/topk_kernels.h", "kernels/transpose_kernels.h", - "local-execution/device_specific.h", + "task-spec/device_specific.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml b/lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/is_grad.enum.toml b/lib/task-spec/include/task-spec/is_grad.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/is_grad.enum.toml rename to lib/task-spec/include/task-spec/is_grad.enum.toml diff --git a/lib/local-execution/include/local-execution/is_trainable.enum.toml b/lib/task-spec/include/task-spec/is_trainable.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/is_trainable.enum.toml rename to lib/task-spec/include/task-spec/is_trainable.enum.toml diff --git a/lib/local-execution/include/local-execution/loss_tensor_t.struct.toml b/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/loss_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/loss_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml b/lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/op_arg_ref.h b/lib/task-spec/include/task-spec/op_arg_ref.h similarity index 79% rename from lib/local-execution/include/local-execution/op_arg_ref.h rename to lib/task-spec/include/task-spec/op_arg_ref.h index 102a8d4362..d95573787a 100644 --- a/lib/local-execution/include/local-execution/op_arg_ref.h +++ b/lib/task-spec/include/task-spec/op_arg_ref.h @@ -1,11 +1,11 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H -#include "local-execution/arg_ref.h" -#include "local-execution/device_specific.h" -#include "local-execution/op_arg_ref_type.dtg.h" -#include "local-execution/per_device_op_state.h" #include "op-attrs/parallel_tensor_shape.dtg.h" +#include "task-spec/arg_ref.h" +#include "task-spec/device_specific.h" +#include "task-spec/op_arg_ref_type.dtg.h" +#include "task-spec/per_device_op_state.h" namespace FlexFlow { diff --git 
a/lib/local-execution/include/local-execution/op_arg_ref_type.variant.toml b/lib/task-spec/include/task-spec/op_arg_ref_type.variant.toml similarity index 73% rename from lib/local-execution/include/local-execution/op_arg_ref_type.variant.toml rename to lib/task-spec/include/task-spec/op_arg_ref_type.variant.toml index cd226da161..e0452c6ce2 100644 --- a/lib/local-execution/include/local-execution/op_arg_ref_type.variant.toml +++ b/lib/task-spec/include/task-spec/op_arg_ref_type.variant.toml @@ -9,8 +9,8 @@ features = [ ] includes = [ - "local-execution/per_device_op_state_ref_type.dtg.h", - "local-execution/parallel_tensor_shape_ref_type.dtg.h", + "task-spec/per_device_op_state_ref_type.dtg.h", + "task-spec/parallel_tensor_shape_ref_type.dtg.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/op_arg_spec.h b/lib/task-spec/include/task-spec/op_arg_spec.h similarity index 85% rename from lib/local-execution/include/local-execution/op_arg_spec.h rename to lib/task-spec/include/task-spec/op_arg_spec.h index 4f3ccd066e..1dc4efcdd1 100644 --- a/lib/local-execution/include/local-execution/op_arg_spec.h +++ b/lib/task-spec/include/task-spec/op_arg_spec.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OP_ARG_SPEC_H #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OP_ARG_SPEC_H -#include "local-execution/op_arg_spec.dtg.h" +#include "task-spec/op_arg_spec.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/op_arg_spec.variant.toml b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml similarity index 76% rename from lib/local-execution/include/local-execution/op_arg_spec.variant.toml rename to lib/task-spec/include/task-spec/op_arg_spec.variant.toml index 28169902ae..e52e5c914e 100644 --- a/lib/local-execution/include/local-execution/op_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "local-execution/concrete_arg.h", - "local-execution/op_arg_ref.h", - "local-execution/runtime_arg_ref.h", + "task-spec/concrete_arg.h", + "task-spec/op_arg_ref.h", + "task-spec/runtime_arg_ref.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/op_slot_options.enum.toml b/lib/task-spec/include/task-spec/op_slot_options.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/op_slot_options.enum.toml rename to lib/task-spec/include/task-spec/op_slot_options.enum.toml diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/task-spec/include/task-spec/op_task_invocation.h similarity index 86% rename from lib/local-execution/include/local-execution/op_task_invocation.h rename to lib/task-spec/include/task-spec/op_task_invocation.h index 0f351c3a0e..cce0a4d6a6 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_invocation.h @@ -2,17 +2,17 @@ #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #include "kernels/accessor.h" -#include "local-execution/concrete_arg.h" -#include "local-execution/is_trainable.dtg.h" -#include "local-execution/op_arg_ref.h" -#include "local-execution/op_arg_spec.dtg.h" -#include "local-execution/op_task_signature.h" -#include "local-execution/op_tensor_spec.h" -#include "local-execution/profiling.h" -#include "local-execution/runtime_arg_ref.h" -#include "local-execution/slot_grad_id.dtg.h" -#include "local-execution/task_id_t.dtg.h" -#include 
"local-execution/variadic_tensor_ref.h" +#include "task-spec/concrete_arg.h" +#include "task-spec/is_trainable.dtg.h" +#include "task-spec/op_arg_ref.h" +#include "task-spec/op_arg_spec.dtg.h" +#include "task-spec/op_task_signature.h" +#include "task-spec/op_tensor_spec.h" +#include "task-spec/profiling.h" +#include "task-spec/runtime_arg_ref.h" +#include "task-spec/slot_grad_id.dtg.h" +#include "task-spec/task_id_t.dtg.h" +#include "task-spec/variadic_tensor_ref.h" #include #include #include diff --git a/lib/local-execution/include/local-execution/op_task_signature.h b/lib/task-spec/include/task-spec/op_task_signature.h similarity index 91% rename from lib/local-execution/include/local-execution/op_task_signature.h rename to lib/task-spec/include/task-spec/op_task_signature.h index 0447644354..eba0023906 100644 --- a/lib/local-execution/include/local-execution/op_task_signature.h +++ b/lib/task-spec/include/task-spec/op_task_signature.h @@ -1,13 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H -#include "local-execution/is_grad.dtg.h" -#include "local-execution/op_task_type.dtg.h" -#include "local-execution/op_tensor_slot_spec.dtg.h" -#include "local-execution/serialization.h" -#include "local-execution/slot_id_t.dtg.h" -#include "local-execution/slot_type.dtg.h" -#include "local-execution/task_id_t.dtg.h" +#include "task-spec/is_grad.dtg.h" +#include "task-spec/op_task_type.dtg.h" +#include "task-spec/op_tensor_slot_spec.dtg.h" +#include "task-spec/serialization.h" +#include "task-spec/slot_id_t.dtg.h" +#include "task-spec/slot_type.dtg.h" +#include "task-spec/task_id_t.dtg.h" #include "utils/hash/unordered_map.h" #include "utils/hash/unordered_set.h" #include "utils/type_index.h" diff --git a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h similarity index 79% rename from lib/local-execution/include/local-execution/op_task_to_task_invocation.h rename to lib/task-spec/include/task-spec/op_task_to_task_invocation.h index 02b3c938b0..0c5fdb39a4 100644 --- a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h @@ -1,12 +1,12 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H -#include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" +#include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/op_task_type.enum.toml b/lib/task-spec/include/task-spec/op_task_type.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/op_task_type.enum.toml rename to lib/task-spec/include/task-spec/op_task_type.enum.toml diff --git a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml similarity index 68% rename from lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml rename to 
lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml index 590dbe6362..109ddf36af 100644 --- a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml +++ b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml @@ -8,11 +8,11 @@ features = [ ] includes = [ - "local-execution/slot_id_t.dtg.h", - "local-execution/slot_type.dtg.h", - "local-execution/tensor_role.dtg.h", - "local-execution/is_grad.dtg.h", - "local-execution/op_slot_options.dtg.h", + "task-spec/slot_id_t.dtg.h", + "task-spec/slot_type.dtg.h", + "task-spec/tensor_role.dtg.h", + "task-spec/is_grad.dtg.h", + "task-spec/op_slot_options.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/op_tensor_spec.h b/lib/task-spec/include/task-spec/op_tensor_spec.h similarity index 92% rename from lib/local-execution/include/local-execution/op_tensor_spec.h rename to lib/task-spec/include/task-spec/op_tensor_spec.h index 29d6cef628..c957704a10 100644 --- a/lib/local-execution/include/local-execution/op_tensor_spec.h +++ b/lib/task-spec/include/task-spec/op_tensor_spec.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H -#include "local-execution/op_task_signature.h" +#include "task-spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml b/lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/parallel_tensor_shape_ref_type.struct.toml b/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/parallel_tensor_shape_ref_type.struct.toml rename to lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/task-spec/include/task-spec/per_device_op_state.h similarity index 71% rename from lib/local-execution/include/local-execution/per_device_op_state.h rename to lib/task-spec/include/task-spec/per_device_op_state.h index 1edd5b6360..23312d90a5 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/task-spec/include/task-spec/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H -#include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" +#include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/task-spec/include/task-spec/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/task-spec/include/task-spec/per_device_op_state.variant.toml diff --git a/lib/local-execution/include/local-execution/per_device_op_state_ref_type.struct.toml b/lib/task-spec/include/task-spec/per_device_op_state_ref_type.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state_ref_type.struct.toml rename to 
lib/task-spec/include/task-spec/per_device_op_state_ref_type.struct.toml diff --git a/lib/local-execution/include/local-execution/profiling.h b/lib/task-spec/include/task-spec/profiling.h similarity index 100% rename from lib/local-execution/include/local-execution/profiling.h rename to lib/task-spec/include/task-spec/profiling.h diff --git a/lib/local-execution/include/local-execution/runtime_arg_config.h b/lib/task-spec/include/task-spec/runtime_arg_config.h similarity index 80% rename from lib/local-execution/include/local-execution/runtime_arg_config.h rename to lib/task-spec/include/task-spec/runtime_arg_config.h index 31b3479a14..f4320bc40b 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_config.h +++ b/lib/task-spec/include/task-spec/runtime_arg_config.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_LOCAL_EXECUTION_RUNTIME_ARG_CONFIG_H #include "kernels/ff_handle.h" -#include "local-execution/device_specific.h" -#include "local-execution/profiling.h" +#include "task-spec/device_specific.h" +#include "task-spec/profiling.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/task-spec/include/task-spec/runtime_arg_ref.h similarity index 81% rename from lib/local-execution/include/local-execution/runtime_arg_ref.h rename to lib/task-spec/include/task-spec/runtime_arg_ref.h index a225a813df..33fccb0106 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_ref.h +++ b/lib/task-spec/include/task-spec/runtime_arg_ref.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H -#include "local-execution/arg_ref.h" -#include "local-execution/config.h" -#include "local-execution/device_specific.h" -#include "local-execution/profiling.h" +#include "task-spec/arg_ref.h" +#include "task-spec/config.h" +#include "task-spec/device_specific.h" +#include "task-spec/profiling.h" #include "utils/fmt.h" #include "utils/type_index.h" diff --git a/lib/local-execution/include/local-execution/serialization.h b/lib/task-spec/include/task-spec/serialization.h similarity index 100% rename from lib/local-execution/include/local-execution/serialization.h rename to lib/task-spec/include/task-spec/serialization.h diff --git a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml b/lib/task-spec/include/task-spec/slot_grad_id.struct.toml similarity index 75% rename from lib/local-execution/include/local-execution/slot_grad_id.struct.toml rename to lib/task-spec/include/task-spec/slot_grad_id.struct.toml index 256091d272..a6533ea884 100644 --- a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml +++ b/lib/task-spec/include/task-spec/slot_grad_id.struct.toml @@ -8,8 +8,8 @@ features = [ ] includes = [ - "local-execution/is_grad.dtg.h", - "local-execution/slot_id_t.dtg.h", + "task-spec/is_grad.dtg.h", + "task-spec/slot_id_t.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/slot_id_t.struct.toml b/lib/task-spec/include/task-spec/slot_id_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/slot_id_t.struct.toml rename to lib/task-spec/include/task-spec/slot_id_t.struct.toml diff --git a/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml b/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml similarity index 76% rename from lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml rename to 
lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml index b3b3a320c7..ab5b981637 100644 --- a/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml +++ b/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml @@ -8,8 +8,8 @@ features = [ ] includes = [ - "local-execution/tensor_type.dtg.h", - "local-execution/slot_id_t.dtg.h", + "task-spec/tensor_type.dtg.h", + "task-spec/slot_id_t.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/slot_type.enum.toml b/lib/task-spec/include/task-spec/slot_type.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/slot_type.enum.toml rename to lib/task-spec/include/task-spec/slot_type.enum.toml diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml similarity index 77% rename from lib/local-execution/include/local-execution/task_arg_spec.variant.toml rename to lib/task-spec/include/task-spec/task_arg_spec.variant.toml index 271e3b73d6..0f81f93405 100644 --- a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml @@ -7,8 +7,8 @@ features = [ ] includes = [ - "local-execution/concrete_arg.h", - "local-execution/runtime_arg_ref.h" + "task-spec/concrete_arg.h", + "task-spec/runtime_arg_ref.h" ] [[values]] diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/task-spec/include/task-spec/task_binding.h similarity index 82% rename from lib/local-execution/include/local-execution/task_binding.h rename to lib/task-spec/include/task-spec/task_binding.h index aba0c01a65..a945fec1d7 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/task-spec/include/task-spec/task_binding.h @@ -1,15 +1,15 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H -#include "local-execution/loss_tensor_t.dtg.h" -#include "local-execution/lowered_tensor_t.dtg.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/slot_id_t.dtg.h" -#include "local-execution/slot_tensor_type_id.dtg.h" -#include "local-execution/task_arg_spec.dtg.h" -#include "local-execution/task_id_t.dtg.h" -#include "local-execution/task_signature.dtg.h" -#include "local-execution/tensor_type_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" +#include "task-spec/optimizer_tensor_t.dtg.h" +#include "task-spec/slot_id_t.dtg.h" +#include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_arg_spec.dtg.h" +#include "task-spec/task_id_t.dtg.h" +#include "task-spec/task_signature.dtg.h" +#include "task-spec/tensor_type_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_id_t.enum.toml b/lib/task-spec/include/task-spec/task_id_t.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/task_id_t.enum.toml rename to lib/task-spec/include/task-spec/task_id_t.enum.toml diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/task-spec/include/task-spec/task_invocation.h similarity index 81% rename from lib/local-execution/include/local-execution/task_invocation.h rename to lib/task-spec/include/task-spec/task_invocation.h index d03d6ac8e1..85940091a1 100644 --- a/lib/local-execution/include/local-execution/task_invocation.h +++ b/lib/task-spec/include/task-spec/task_invocation.h @@ -1,7 
+1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H -#include "local-execution/task_invocation.dtg.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/task-spec/include/task-spec/task_invocation.struct.toml similarity index 75% rename from lib/local-execution/include/local-execution/task_invocation.struct.toml rename to lib/task-spec/include/task-spec/task_invocation.struct.toml index c9e1e22ba1..38e02a1370 100644 --- a/lib/local-execution/include/local-execution/task_invocation.struct.toml +++ b/lib/task-spec/include/task-spec/task_invocation.struct.toml @@ -7,8 +7,8 @@ features = [ ] includes = [ - "local-execution/task_binding.h", - "local-execution/task_id_t.dtg.h" + "task-spec/task_binding.h", + "task-spec/task_id_t.dtg.h" ] diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/task-spec/include/task-spec/task_signature.h similarity index 97% rename from lib/local-execution/include/local-execution/task_signature.h rename to lib/task-spec/include/task-spec/task_signature.h index b10edce6d4..8214e7e1b5 100644 --- a/lib/local-execution/include/local-execution/task_signature.h +++ b/lib/task-spec/include/task-spec/task_signature.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H -#include "local-execution/task_signature.dtg.h" +#include "task-spec/task_signature.dtg.h" #include "utils/type_index.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/task-spec/include/task-spec/task_signature.struct.toml similarity index 86% rename from lib/local-execution/include/local-execution/task_signature.struct.toml rename to lib/task-spec/include/task-spec/task_signature.struct.toml index 7efb0c658a..3df0a8cfc7 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/task-spec/include/task-spec/task_signature.struct.toml @@ -7,8 +7,8 @@ features = [ ] includes = [ - "local-execution/tensor_type_slot_spec.dtg.h", - "local-execution/slot_id_t.dtg.h", + "task-spec/tensor_type_slot_spec.dtg.h", + "task-spec/slot_id_t.dtg.h", "", "" ] diff --git a/lib/local-execution/include/local-execution/tensor_role.enum.toml b/lib/task-spec/include/task-spec/tensor_role.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/tensor_role.enum.toml rename to lib/task-spec/include/task-spec/tensor_role.enum.toml diff --git a/lib/local-execution/include/local-execution/tensor_type.enum.toml b/lib/task-spec/include/task-spec/tensor_type.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/tensor_type.enum.toml rename to lib/task-spec/include/task-spec/tensor_type.enum.toml diff --git a/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml b/lib/task-spec/include/task-spec/tensor_type_slot_spec.struct.toml similarity index 72% rename from lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml rename to lib/task-spec/include/task-spec/tensor_type_slot_spec.struct.toml index ceba809474..26e70a5ef8 100644 --- a/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml +++ b/lib/task-spec/include/task-spec/tensor_type_slot_spec.struct.toml @@ -8,9 +8,9 @@ features = [ ] includes = [ - "local-execution/slot_type.dtg.h", - 
"local-execution/slot_id_t.dtg.h", - "local-execution/tensor_type.dtg.h", + "task-spec/slot_type.dtg.h", + "task-spec/slot_id_t.dtg.h", + "task-spec/tensor_type.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml b/lib/task-spec/include/task-spec/tensor_type_t.variant.toml similarity index 76% rename from lib/local-execution/include/local-execution/tensor_type_t.variant.toml rename to lib/task-spec/include/task-spec/tensor_type_t.variant.toml index cd3520ee5d..b93ed91081 100644 --- a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml +++ b/lib/task-spec/include/task-spec/tensor_type_t.variant.toml @@ -9,9 +9,9 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", - "local-execution/optimizer_tensor_t.dtg.h", - "local-execution/gradient_tensor_t.dtg.h", - "local-execution/loss_tensor_t.dtg.h" + "task-spec/optimizer_tensor_t.dtg.h", + "task-spec/gradient_tensor_t.dtg.h", + "task-spec/loss_tensor_t.dtg.h" ] [[values]] diff --git a/lib/local-execution/include/local-execution/variadic_tensor_ref.h b/lib/task-spec/include/task-spec/variadic_tensor_ref.h similarity index 81% rename from lib/local-execution/include/local-execution/variadic_tensor_ref.h rename to lib/task-spec/include/task-spec/variadic_tensor_ref.h index 56da1bab64..e990fd5366 100644 --- a/lib/local-execution/include/local-execution/variadic_tensor_ref.h +++ b/lib/task-spec/include/task-spec/variadic_tensor_ref.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H -#include "local-execution/arg_ref.h" -#include "local-execution/op_tensor_spec.h" +#include "task-spec/arg_ref.h" +#include "task-spec/op_tensor_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/concrete_arg.cc b/lib/task-spec/src/concrete_arg.cc similarity index 94% rename from lib/local-execution/src/concrete_arg.cc rename to lib/task-spec/src/concrete_arg.cc index 450d663e17..b67b74b19a 100644 --- a/lib/local-execution/src/concrete_arg.cc +++ b/lib/task-spec/src/concrete_arg.cc @@ -1,4 +1,4 @@ -#include "local-execution/concrete_arg.h" +#include "task-spec/concrete_arg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/task-spec/src/op_arg_ref.cc similarity index 87% rename from lib/local-execution/src/op_arg_ref.cc rename to lib/task-spec/src/op_arg_ref.cc index b3d6e2f1a5..a427117982 100644 --- a/lib/local-execution/src/op_arg_ref.cc +++ b/lib/task-spec/src/op_arg_ref.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_arg_ref.h" +#include "task-spec/op_arg_ref.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_spec.cc b/lib/task-spec/src/op_arg_spec.cc similarity index 83% rename from lib/local-execution/src/op_arg_spec.cc rename to lib/task-spec/src/op_arg_spec.cc index ddf50d9a4e..6e48a7c5f7 100644 --- a/lib/local-execution/src/op_arg_spec.cc +++ b/lib/task-spec/src/op_arg_spec.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_arg_spec.h" +#include "task-spec/op_arg_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/task-spec/src/op_task_invocation.cc similarity index 97% rename from lib/local-execution/src/op_task_invocation.cc rename to lib/task-spec/src/op_task_invocation.cc index 19c8894b05..d495dd9f92 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/task-spec/src/op_task_invocation.cc @@ -1,5 +1,5 @@ -#include "local-execution/op_task_invocation.h" -#include 
"local-execution/op_arg_spec.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/op_arg_spec.h" #include "utils/containers/contains_key.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/task-spec/src/op_task_signature.cc similarity index 99% rename from lib/local-execution/src/op_task_signature.cc rename to lib/task-spec/src/op_task_signature.cc index 932b330453..94ac16d092 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/task-spec/src/op_task_signature.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_task_signature.h" +#include "task-spec/op_task_signature.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h" #include "utils/fmt/unordered_set.h" diff --git a/lib/local-execution/src/op_task_to_task_invocation.cc b/lib/task-spec/src/op_task_to_task_invocation.cc similarity index 98% rename from lib/local-execution/src/op_task_to_task_invocation.cc rename to lib/task-spec/src/op_task_to_task_invocation.cc index 0e04a2adec..f52800a8de 100644 --- a/lib/local-execution/src/op_task_to_task_invocation.cc +++ b/lib/task-spec/src/op_task_to_task_invocation.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_task_to_task_invocation.h" +#include "task-spec/op_task_to_task_invocation.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" diff --git a/lib/local-execution/src/op_tensor_spec.cc b/lib/task-spec/src/op_tensor_spec.cc similarity index 89% rename from lib/local-execution/src/op_tensor_spec.cc rename to lib/task-spec/src/op_tensor_spec.cc index a3d3e7ddac..1d97e6ae16 100644 --- a/lib/local-execution/src/op_tensor_spec.cc +++ b/lib/task-spec/src/op_tensor_spec.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_tensor_spec.h" +#include "task-spec/op_tensor_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/runtime_arg_ref.cc b/lib/task-spec/src/runtime_arg_ref.cc similarity index 89% rename from lib/local-execution/src/runtime_arg_ref.cc rename to lib/task-spec/src/runtime_arg_ref.cc index 1f591b4d82..bb4625c113 100644 --- a/lib/local-execution/src/runtime_arg_ref.cc +++ b/lib/task-spec/src/runtime_arg_ref.cc @@ -1,5 +1,5 @@ -#include "local-execution/runtime_arg_ref.h" -#include "local-execution/device_specific.h" +#include "task-spec/runtime_arg_ref.h" +#include "task-spec/device_specific.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_invocation.cc b/lib/task-spec/src/task_invocation.cc similarity index 77% rename from lib/local-execution/src/task_invocation.cc rename to lib/task-spec/src/task_invocation.cc index e08c1036da..4ba97f26de 100644 --- a/lib/local-execution/src/task_invocation.cc +++ b/lib/task-spec/src/task_invocation.cc @@ -1,4 +1,4 @@ -#include "local-execution/task_invocation.h" +#include "task-spec/task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_signature.cc b/lib/task-spec/src/task_signature.cc similarity index 93% rename from lib/local-execution/src/task_signature.cc rename to lib/task-spec/src/task_signature.cc index 1d57a1fc54..3ac038e8c5 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/task-spec/src/task_signature.cc @@ -1,4 +1,4 @@ -#include "local-execution/task_signature.h" +#include "task-spec/task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/task-spec/src/variadic_tensor_ref.cc similarity index 75% rename from lib/local-execution/src/variadic_tensor_ref.cc rename to lib/task-spec/src/variadic_tensor_ref.cc index 
efd43a6648..564e58ba95 100644 --- a/lib/local-execution/src/variadic_tensor_ref.cc +++ b/lib/task-spec/src/variadic_tensor_ref.cc @@ -1,4 +1,4 @@ -#include "local-execution/variadic_tensor_ref.h" +#include "task-spec/variadic_tensor_ref.h" namespace FlexFlow { From 639c2c1c6b2cd3efe76e2c62b6925e9bcc24b817 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 5 Feb 2025 18:10:59 -0800 Subject: [PATCH 39/91] Delete outdated sim environment file --- .../include/local-execution/sim_environment.h | 119 ------------------ 1 file changed, 119 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/sim_environment.h diff --git a/lib/local-execution/include/local-execution/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h deleted file mode 100644 index 6c2f8d4ebb..0000000000 --- a/lib/local-execution/include/local-execution/sim_environment.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H - -#include "kernels/accessor.h" -#include "kernels/allocation.h" -#include "local-execution/cost_metrics.h" -#include "local-execution/task_argument_accessor.h" -#include "local-execution/task_signature_impl.h" -#include "op-attrs/parallel_tensor_shape.dtg.h" -#include "pcg/machine_view.h" -#include "task-spec/op_task_invocation.h" -#include - -namespace FlexFlow { - -struct InputParallelTensorDesc { -public: - ParallelTensorShape shape; - IsTrainable trainable; -}; - -struct InputVariadicParallelTensorDesc { -public: - std::vector shapes; - IsTrainable trainable; -}; - -struct SimTaskBinding { - void bind(slot_id_t, ParallelTensorShape const &); - void bind_untrainable(slot_id_t, ParallelTensorShape const &); - void bind(slot_id_t, ParallelTensorShape const &, IsTrainable); - void bind(slot_id_t, InputParallelTensorDesc const &); - - void bind(slot_id_t, std::vector const &); - void bind_untrainable(slot_id_t, std::vector const &); - void bind(slot_id_t, std::vector const &, IsTrainable); - void bind(slot_id_t, InputVariadicParallelTensorDesc const &); - - template - void bind_arg(slot_id_t, T const &); -}; - -SimTaskBinding infer_bwd_binding(SimTaskBinding const &); - -struct SimEnvironment { - TaskArgumentAccessor get_init_accessor(task_id_t, SimTaskBinding const &); - TaskArgumentAccessor get_fwd_accessor(task_id_t, SimTaskBinding const &); - TaskArgumentAccessor get_bwd_accessor(task_id_t, SimTaskBinding const &); -}; - -struct SimEnvFactory { - SimEnvironment new_environment() const; -}; - -GenericTensorAccessorW allocate_input(SimEnvironment &sim, TensorShape const &); -GenericTensorAccessorW allocate_input(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_input(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_weight(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_weight(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_weight(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_output(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_output(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_output(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_input_grad(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_input_grad(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - 
allocate_input_grad(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_weight_grad(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_weight_grad(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_weight_grad(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_output_grad(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_output_grad(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_output_grad(SimEnvironment &sim, - std::vector const &); - -Allocator create_allocator(SimEnvironment &sim); -PerDeviceFFHandle get_ff_handle(SimEnvironment &sim); - -size_t get_input_memory_usage(SimEnvironment const &); -size_t get_output_memory_usage(SimEnvironment const &); -size_t get_weights_memory_usage(SimEnvironment const &); -size_t get_op_total_memory(SimEnvironment const &); - -CostMetrics make_metrics(float forward_time, - float backward_time, - float sync_time, - SimEnvironment const &); - -float default_estimate_sync_time(SimEnvironment const &); - -} // namespace FlexFlow - -#endif From a697044c4bbecae53357114d4d9c8cae12f46793 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 07:06:35 -0800 Subject: [PATCH 40/91] Finish API --- .../local-execution/allocated_tensors.h | 23 ++ .../allocated_tensors.struct.toml | 30 ++ .../local-execution/local_args_backing.h | 11 +- .../local-execution/local_tensor_backing.h | 67 ++-- .../local-execution/local_training_backing.h | 36 +- .../local-execution/model_training_instance.h | 15 +- .../include/local-execution/task_registry.h | 9 +- .../unallocated_tensors.struct.toml | 30 ++ lib/local-execution/src/allocated_tensors.cc | 128 +++++++ lib/local-execution/src/local_args_backing.cc | 17 +- .../src/local_tensor_backing.cc | 315 ++++++++---------- .../src/local_training_backing.cc | 173 ++++++---- .../src/model_training_instance.cc | 27 +- lib/local-execution/src/task_registry.cc | 81 +++-- lib/pcg/include/pcg/computation_graph.h | 4 + lib/pcg/src/pcg/computation_graph.cc | 7 + .../task-spec/op_task_to_task_invocation.h | 7 +- .../src/op_task_to_task_invocation.cc | 23 +- 18 files changed, 586 insertions(+), 417 deletions(-) create mode 100644 lib/local-execution/include/local-execution/allocated_tensors.h create mode 100644 lib/local-execution/include/local-execution/allocated_tensors.struct.toml create mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.struct.toml create mode 100644 lib/local-execution/src/allocated_tensors.cc diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h new file mode 100644 index 0000000000..60ee662ba8 --- /dev/null +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H +#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H + +#include "local-execution/allocated_tensors.dtg.h" +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +bool are_allocated_forward_tensors_valid(AllocatedTensors const &, + ComputationGraph const &); +bool are_allocated_gradient_tensors_valid(AllocatedTensors const &, + ComputationGraph const &); +bool are_allocated_optimizer_tensors_valid(AllocatedTensors const &, + ComputationGraph const &); + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &, + std::unordered_map const &, + ArrayShape const &); + +} // 
namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml new file mode 100644 index 0000000000..e4be709709 --- /dev/null +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "AllocatedTensors" +features = [ + "eq", + "fmt", + "hash", + "ord" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h" +] + +[[fields]] +name = "tensor_type_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h index 4c9ede54fd..e9044dc6fa 100644 --- a/lib/local-execution/include/local-execution/local_args_backing.h +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -12,18 +12,19 @@ namespace FlexFlow { struct LocalArgsBacking { - LocalArgsBacking(RuntimeArgConfig const &); + LocalArgsBacking( + RuntimeArgConfig const &, + std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> const &); public: // arguments + RuntimeArgConfig runtime_arg_config; std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> per_device_op_states; - RuntimeArgConfig runtime_arg_config; }; -void add_per_device_op_state(LocalArgsBacking &, - layer_guid_t const &, - DeviceSpecificDeviceStates const &); +LocalArgsBacking + make_args_backing_with_empty_device_states(RuntimeArgConfig const &); std::optional<DeviceSpecificDeviceStates> get_per_device_op_state_if_exists(LocalArgsBacking const &,
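For orientation, a minimal sketch (not part of the patch) of what a value of the AllocatedTensors struct introduced above holds. All names here are hypothetical: `input_guid` stands for a tensor_guid_t from the computation graph, `input_grad` for a gradient_tensor_t, and `input_backing` / `grad_backing` for accessors obtained from a real Allocator.

// Hedged sketch: the three fields mirror allocated_tensors.struct.toml, i.e.
// raw backings keyed by TensorTypeVariant, plus the gradient and optimizer
// mappings that tie those backings back to computation-graph tensors.
AllocatedTensors allocated = AllocatedTensors{
    /*tensor_type_backings=*/{{TensorTypeVariant{input_guid}, input_backing},
                              {TensorTypeVariant{input_grad}, grad_backing}},
    /*gradient_mapping=*/{{input_guid, input_grad}},
    /*optimizer_mapping=*/{}};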
diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 9d35373784..86244eab13 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -3,22 +3,19 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #include "kernels/accessor.h" +#include "local-execution/allocated_tensors.dtg.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/lowered_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" -#include "op-attrs/tensor_shape.dtg.h" +#include "local-execution/unallocated_tensors.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" #include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/optimizer_tensor_t.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/tensor_role.dtg.h" -#include "task-spec/tensor_type_t.dtg.h" namespace FlexFlow { using TensorBackingMap = std::unordered_map<lowered_tensor_t, GenericTensorAccessorW>; struct LocalTensorBacking { - LocalTensorBacking() = default; - LocalTensorBacking( - std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const - &allocated_tensor_backings, - std::unordered_set<tensor_guid_t> const &allocated_tensor_guids, - std::unordered_map<tensor_guid_t, gradient_tensor_t> const - &allocated_gradient_mapping, - std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>> const - &allocated_optimizer_mapping, - std::unordered_set<loss_tensor_t> const &allocated_loss_tensors); - - lowered_tensor_t allocate_tensor(TensorShape const &, Allocator &); + LocalTensorBacking(AllocatedTensors const &, + UnallocatedTensors const &, + Allocator const &); + +public: + GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; public: // tensors @@ -55,39 +46,23 @@ struct LocalTensorBacking { std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>> tensor_optimizer_mapping; + Allocator allocator; private: - lowered_tensor_t insert_tensor(GenericTensorAccessorW const &); + lowered_tensor_t insert_tensor(TensorTypeVariant const &); LoweredTensorSource lowered_tensor_source; }; -void allocate_tensor_guid(LocalTensorBacking &, - tensor_guid_t const &, - TensorShape const &, - Allocator &); -void allocate_gradient_tensor(LocalTensorBacking &, - gradient_tensor_t const &, - tensor_guid_t const &, - TensorShape const &, - Allocator &); -void allocate_optimizer_tensors(LocalTensorBacking &, - std::vector<optimizer_tensor_t> const &, - tensor_guid_t const &, - TensorShape const &, - Allocator &); - -void allocate_all_computation_graph_tensors(LocalTensorBacking &, - GradientTensorSource &, - ComputationGraph const &, - Allocator &); -void allocate_all_optimizer_tensors(LocalTensorBacking &, - OptimizerTensorSource &, - ComputationGraph const &, - Allocator &, - OptimizerAttrs const &); -loss_tensor_t allocate_loss_tensor(LocalTensorBacking &, - LossTensorSource const &, - TensorShape const &, - Allocator &); +UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &, + ComputationGraph const &, + GradientTensorSource &); + +UnallocatedTensors + generate_unallocated_tensors_with_optimizer(AllocatedTensors const &, + ComputationGraph const &, + GradientTensorSource &, + OptimizerTensorSource &, + OptimizerAttrs const &); TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, TaskBinding const &); diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index ef5e7ec41e..b61d20c232
LocalTensorBacking const &); std::optional call_task_impl(TaskRegistry const &, - task_id_t task_id, - TaskArgumentAccessor acc); + task_id_t const &task_id, + TaskArgumentAccessor const &acc); -void execute_init(LocalTrainingBacking &, layer_guid_t const &); -std::optional execute_forward(LocalTrainingBacking &, +std::optional execute_forward(LocalTrainingBacking const &, layer_guid_t const &); -std::optional execute_backward(LocalTrainingBacking &, +std::optional execute_backward(LocalTrainingBacking const &, layer_guid_t const &); -void compute_loss(LocalTrainingBacking &, +void compute_loss(LocalTrainingBacking const &, LossAttrs const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor); -void execute_update(LocalTrainingBacking &, +void execute_update(LocalTrainingBacking const &, layer_guid_t const &, OptimizerAttrs const &); TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, LocalArgsBacking const &, - TaskInvocation const &, - Allocator &); + TaskInvocation const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index c264418abc..99a1bd5a9a 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -14,24 +14,19 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(LocalTrainingBacking const &, tensor_guid_t const &logit_tensor, - TensorShape const &label_tensor_shape, + loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); LocalTrainingBacking training_backing; - LossAttrs loss_attrs; - OptimizerAttrs optimizer_attrs; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; - -private: - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; + LossAttrs loss_attrs; + OptimizerAttrs optimizer_attrs; }; -void init(ModelTrainingInstance &); -PerLayerElapsedTime forward(ModelTrainingInstance &); -PerLayerElapsedTime backward(ModelTrainingInstance &); +PerLayerElapsedTime forward(ModelTrainingInstance const &); +PerLayerElapsedTime backward(ModelTrainingInstance const &); void update(ModelTrainingInstance &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index 22cc344b3d..56e98ba8da 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -9,19 +9,12 @@ namespace FlexFlow { -TaskRegistry empty_task_registry(); - -void register_tasks_for_layer(TaskRegistry &, - layer_guid_t const &, - ComputationGraphOpAttrs const &attrs); +TaskRegistry construct_task_registry(ComputationGraph const &); bool registry_contains_task_for_layer(TaskRegistry const &, layer_guid_t const &, OpTaskType const &); -void register_all_computation_graph_tasks(TaskRegistry &, - ComputationGraph const &); - } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml new file mode 100644 index 0000000000..87abf83d13 --- /dev/null +++ b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "UnallocatedTensors" +features = [ + "eq", + "fmt", + "hash", + "ord" +] + 
diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml new file mode 100644 index 0000000000..87abf83d13 --- /dev/null +++ b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "UnallocatedTensors" +features = [ + "eq", + "fmt", + "hash", + "ord" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "op-attrs/tensor_shape.dtg.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h" +] + +[[fields]] +name = "tensor_type_shapes" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc new file mode 100644 index 0000000000..e64db0cfff --- /dev/null +++ b/lib/local-execution/src/allocated_tensors.cc @@ -0,0 +1,128 @@ +#include "local-execution/allocated_tensors.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/keys.h" +#include "utils/containers/set_union.h" + +namespace FlexFlow { + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &tensor_type, + std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const + &allocated_tensor_backings, + ArrayShape const &expected_shape) { + if (allocated_tensor_backings.count(tensor_type)) { + GenericTensorAccessorW tensor_backing = + allocated_tensor_backings.at(tensor_type); + if (expected_shape == tensor_backing.shape) { + return true; + } + } + return false; +}; + +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph) { + std::unordered_set<tensor_guid_t> all_tensor_guids = + set_union(keys(allocated_tensors.gradient_mapping), + keys(allocated_tensors.optimizer_mapping)); + for (tensor_guid_t const &tensor_guid : all_tensor_guids) { + TensorAttrs expected_tensor_attrs = + get_tensor_attrs(computation_graph, tensor_guid); + if (!is_allocated_tensor_backing_valid( + TensorTypeVariant{tensor_guid}, + allocated_tensors.tensor_type_backings, + ArrayShape{expected_tensor_attrs.shape})) { + return false; + } + } + return true; +} + +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph) { + std::unordered_set<TensorTypeVariant> + tensors_in_mappings; // will check whether any dangling gradient tensors + // were allocated + + for (std::pair<tensor_guid_t, gradient_tensor_t> const &tensor_to_grad : + allocated_tensors.gradient_mapping) { + TensorAttrs expected_tensor_attrs = + get_tensor_attrs(computation_graph, tensor_to_grad.first); + if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + allocated_tensors.tensor_type_backings + .at(TensorTypeVariant{tensor_to_grad.first}) + .shape; + TensorTypeVariant gradient_tensor = + TensorTypeVariant{tensor_to_grad.second}; + if (is_allocated_tensor_backing_valid( + gradient_tensor, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(gradient_tensor); + } else { + return false; + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has<gradient_tensor_t>()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + return true; +} + +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph) { + std::unordered_set<TensorTypeVariant> + tensors_in_mappings; // will check whether any dangling optimizer tensors + // were allocated + + for (std::pair<tensor_guid_t, std::vector<optimizer_tensor_t>> const + &tensor_to_optimizers :
allocated_tensors.optimizer_mapping) { + TensorAttrs expected_tensor_attrs = + get_tensor_attrs(computation_graph, tensor_to_optimizers.first); + if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + allocated_tensors.tensor_type_backings + .at(TensorTypeVariant{tensor_to_optimizers.first}) + .shape; + for (optimizer_tensor_t const &optimizer_tensor : + tensor_to_optimizers.second) { + if (is_allocated_tensor_backing_valid( + TensorTypeVariant{optimizer_tensor}, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); + } else { + return false; + } + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + + return true; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc index 715a96efa6..4a342767b2 100644 --- a/lib/local-execution/src/local_args_backing.cc +++ b/lib/local-execution/src/local_args_backing.cc @@ -7,15 +7,18 @@ namespace FlexFlow { -LocalArgsBacking::LocalArgsBacking(RuntimeArgConfig const &runtime_arg_config) - : runtime_arg_config(runtime_arg_config){}; - -void add_per_device_op_state(LocalArgsBacking &local_args_backing, - layer_guid_t const &op_guid, - DeviceSpecificDeviceStates const &device_state) { - local_args_backing.per_device_op_states.insert({op_guid, device_state}); +LocalArgsBacking make_args_backing_with_empty_device_states( + RuntimeArgConfig const &runtime_arg_config) { + return LocalArgsBacking{runtime_arg_config, {}}; } +LocalArgsBacking::LocalArgsBacking( + RuntimeArgConfig const &runtime_arg_config, + std::unordered_map const + &device_states) + : runtime_arg_config(runtime_arg_config), + per_device_op_states(device_states){}; + std::optional get_per_device_op_state_if_exists( LocalArgsBacking const &local_args_backing, layer_guid_t const &layer_guid) { diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index 00c170d501..67bbd59c3b 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,6 +1,7 @@ #include "local-execution/local_tensor_backing.h" #include "task-spec/slot_grad_id.dtg.h" +#include "local-execution/allocated_tensors.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" @@ -11,195 +12,177 @@ namespace FlexFlow { LocalTensorBacking::LocalTensorBacking( - std::unordered_map const - &allocated_tensor_backings, - std::unordered_set const &allocated_tensor_guids, - std::unordered_map const - &allocated_gradient_mapping, - std::unordered_map> const - &allocated_optimizer_mapping, - std::unordered_set const &allocated_loss_tensors) - : tensor_gradient_mapping(allocated_gradient_mapping), - tensor_optimizer_mapping(allocated_optimizer_mapping) { - - // computation graph tensors - for (tensor_guid_t const &allocated_tensor_guid : allocated_tensor_guids) { - lowered_tensor_t lowered_tensor = this->insert_tensor( - allocated_tensor_backings.at(TensorTypeVariant{allocated_tensor_guid})); - this->tensor_lowering_mapping.insert( - {allocated_tensor_guid, lowered_tensor}); - } - - // gradient tensors - for (std::pair const - &tensor_guid_gradient_pair : allocated_gradient_mapping) { - 
gradient_tensor_t allocated_gradient_tensor = - tensor_guid_gradient_pair.second; + AllocatedTensors const &allocated_tensors, + UnallocatedTensors const &unallocated_tensors, + Allocator const &allocator) + : tensor_gradient_mapping(allocated_tensors.gradient_mapping), + tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), + allocator(allocator) { + + // handle already-allocated tensors + for (std::pair const + &tensor_type_backing : allocated_tensors.tensor_type_backings) { lowered_tensor_t lowered_tensor = - this->insert_tensor(allocated_tensor_backings.at( - TensorTypeVariant{allocated_gradient_tensor})); - this->gradient_tensor_lowering_mapping.insert( - {allocated_gradient_tensor, lowered_tensor}); + this->insert_tensor(tensor_type_backing.first); + this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); } - // optimizer tensors + // allocate new tensors + this->tensor_gradient_mapping.insert( + unallocated_tensors.gradient_mapping.begin(), + unallocated_tensors.gradient_mapping.end()); + for (std::pair> const - &tensor_guid_optimizers_pair : allocated_optimizer_mapping) { - for (optimizer_tensor_t const &allocated_optimizer_tensor : - tensor_guid_optimizers_pair.second) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(allocated_tensor_backings.at( - TensorTypeVariant{allocated_optimizer_tensor})); - this->optimizer_tensor_lowering_mapping.insert( - {allocated_optimizer_tensor, lowered_tensor}); + &unallocated_optimizer_tensors : + unallocated_tensors.optimizer_mapping) { + if (this->tensor_optimizer_mapping.count( + unallocated_optimizer_tensors.first)) { + for (optimizer_tensor_t const &optimizer_tensor : + unallocated_optimizer_tensors.second) { + this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first] + .push_back(optimizer_tensor); + } + } else { + this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors}); } } - // loss tensors - for (loss_tensor_t const &allocated_loss_tensor : allocated_loss_tensors) { - lowered_tensor_t lowered_tensor = this->insert_tensor( - allocated_tensor_backings.at(TensorTypeVariant{allocated_loss_tensor})); - this->loss_tensor_lowering_mapping.insert( - {allocated_loss_tensor, lowered_tensor}); + for (std::pair const &tensor_type_shape : + unallocated_tensors.tensor_type_shapes) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(tensor_type_shape.first); + GenericTensorAccessorW tensor_backing = + this->allocator.allocate_tensor(tensor_type_shape.second); + this->tensor_backings.insert({lowered_tensor, tensor_backing}); } - - // sanity check that backings match up with the mappings - assert(this->tensor_backings.size() == allocated_tensor_backings.size()); }; -lowered_tensor_t LocalTensorBacking::insert_tensor( - GenericTensorAccessorW const &tensor_backing) { +lowered_tensor_t + LocalTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { lowered_tensor_t lowered_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_backings.insert({lowered_tensor, tensor_backing}); + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); + }, + [&](gradient_tensor_t const &gradient_tensor) { + this->gradient_tensor_lowering_mapping.insert( + {gradient_tensor, lowered_tensor}); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + this->optimizer_tensor_lowering_mapping.insert( + {optimizer_tensor, lowered_tensor}); + }, + [&](loss_tensor_t const &loss_tensor) { + 
this->loss_tensor_lowering_mapping.insert( + {loss_tensor, lowered_tensor}); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); return lowered_tensor; } -lowered_tensor_t - LocalTensorBacking::allocate_tensor(TensorShape const &tensor_shape, - Allocator &allocator) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_shape); - return this->insert_tensor(tensor_backing); +GenericTensorAccessorW + LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { + lowered_tensor_t lowered_tensor = tensor_type.visit( + overload{[&](tensor_guid_t const &tensor_guid) { + this->tensor_lowering_mapping.at(tensor_guid); + }, + [&](gradient_tensor_t const &gradient_tensor) { + this->gradient_tensor_lowering_mapping.at(gradient_tensor); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + }, + [&](loss_tensor_t const &loss_tensor) { + this->loss_tensor_lowering_mapping.at(loss_tensor); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); + return this->tensor_backings.at(lowered_tensor); } -void allocate_tensor_guid(LocalTensorBacking &local_tensor_backing, - tensor_guid_t const &tensor_guid, - TensorShape const &tensor_shape, - Allocator &allocator) { - if (!contains_key(local_tensor_backing.tensor_lowering_mapping, - tensor_guid)) { - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.tensor_lowering_mapping.insert( - {tensor_guid, lowered_tensor}); - } -} - -void allocate_gradient_tensor(LocalTensorBacking &local_tensor_backing, - gradient_tensor_t const &gradient_tensor, - tensor_guid_t const &tensor_guid, - TensorShape const &tensor_shape, - Allocator &allocator) { - if (!contains_key(local_tensor_backing.tensor_gradient_mapping, - tensor_guid)) { - local_tensor_backing.tensor_gradient_mapping.insert( - {tensor_guid, gradient_tensor}); - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.gradient_tensor_lowering_mapping.insert( - {gradient_tensor, lowered_tensor}); - } -} - -void allocate_optimizer_tensors( - LocalTensorBacking &local_tensor_backing, - std::vector const &optimizer_tensors, - tensor_guid_t const &tensor_guid, - TensorShape const &tensor_shape, - Allocator &allocator) { - if (!contains_key(local_tensor_backing.tensor_optimizer_mapping, - tensor_guid)) { - // insert new optimizer tensors into mappings - std::vector optimizer_tensors; - for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) { - // allocate lowered tensor - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.optimizer_tensor_lowering_mapping.insert( - {optimizer_tensor, lowered_tensor}); - } - local_tensor_backing.tensor_optimizer_mapping.insert( - {tensor_guid, optimizer_tensors}); - } -} +UnallocatedTensors + generate_unallocated_tensors(AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph, + GradientTensorSource &gradient_tensor_source) { + assert(are_allocated_forward_tensors_valid(allocated_tensors, + computation_graph)); + assert(are_allocated_gradient_tensors_valid(allocated_tensors, + computation_graph)); -void allocate_loss_tensor(LocalTensorBacking &local_tensor_backing, - loss_tensor_t const 
&loss_tensor,
-                          TensorShape const &tensor_shape,
-                          Allocator &allocator) {
-  lowered_tensor_t lowered_tensor =
-      local_tensor_backing.allocate_tensor(tensor_shape, allocator);
-  local_tensor_backing.loss_tensor_lowering_mapping.insert(
-      {loss_tensor, lowered_tensor});
-}
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping;
 
-void allocate_all_computation_graph_tensors(
-    LocalTensorBacking &local_tensor_backing,
-    GradientTensorSource &gradient_tensor_source,
-    ComputationGraph const &computation_graph,
-    Allocator &allocator) {
-  // allocate each layer's tensors and gradient tensors
   for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
     TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
-    allocate_tensor_guid(
-        local_tensor_backing, tensor_guid, tensor_attrs.shape, allocator);
+    TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid};
+    if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) {
+      tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape});
+    }
 
-    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !allocated_tensors.gradient_mapping.count(tensor_guid)) {
       gradient_tensor_t gradient_tensor =
           gradient_tensor_source.new_gradient_tensor();
-      allocate_gradient_tensor(local_tensor_backing,
-                               gradient_tensor,
-                               tensor_guid,
-                               tensor_attrs.shape,
-                               allocator);
+      tensor_type_shapes.insert(
+          {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape});
+      gradient_mapping.insert({tensor_guid, gradient_tensor});
     }
   }
+
+  return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}};
 }
 
-void allocate_all_optimizer_tensors(
-    LocalTensorBacking &local_tensor_backing,
-    OptimizerTensorSource &optimizer_tensor_source,
+UnallocatedTensors generate_unallocated_tensors_with_optimizer(
+    AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
-    Allocator &allocator,
+    GradientTensorSource &gradient_tensor_source,
+    OptimizerTensorSource &optimizer_tensor_source,
     OptimizerAttrs const &optimizer_attrs) {
+
+  UnallocatedTensors unallocated_tensors = generate_unallocated_tensors(
+      allocated_tensors, computation_graph, gradient_tensor_source);
+  assert(are_allocated_optimizer_tensors_valid(allocated_tensors,
+                                               computation_graph));
+
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes =
+      unallocated_tensors.tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping =
+      unallocated_tensors.gradient_mapping;
+  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+      optimizer_mapping;
+
   for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
     TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
-    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !allocated_tensors.optimizer_mapping.count(tensor_guid)) {
       std::vector<optimizer_tensor_t> optimizer_tensors;
-      for (int i = 0; i < get_num_optimizer_tensors(optimizer_attrs); ++i) {
-        optimizer_tensors.push_back(
-            optimizer_tensor_source.new_optimizer_tensor());
+
+      int num_optimizer_tensors_to_allocate =
+          get_num_optimizer_tensors(optimizer_attrs);
+      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
+        num_optimizer_tensors_to_allocate -=
+            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
+      }
+
+      for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) {
+        optimizer_tensor_t optimizer_tensor =
+            optimizer_tensor_source.new_optimizer_tensor();
+        optimizer_tensors.push_back(optimizer_tensor);
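+        // Each freshly created optimizer tensor is recorded with the
+        // forward tensor's shape, so the backing constructor can allocate
+        // it directly. E.g., an optimizer reporting two state tensors per
+        // weight, with one already allocated by the caller, creates and
+        // records exactly one new optimizer_tensor_t here.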
+ tensor_type_shapes.insert( + {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); } - allocate_optimizer_tensors(local_tensor_backing, - optimizer_tensors, - tensor_guid, - tensor_attrs.shape, - allocator); + optimizer_mapping.insert({tensor_guid, optimizer_tensors}); } } -} -loss_tensor_t allocate_loss_tensor(LocalTensorBacking &local_tensor_backing, - LossTensorSource &loss_tensor_source, - TensorShape const &tensor_shape, - Allocator &allocator) { - loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.loss_tensor_lowering_mapping.insert( - {loss_tensor, lowered_tensor}); - return loss_tensor; + return UnallocatedTensors{ + tensor_type_shapes, gradient_mapping, optimizer_mapping}; } TensorSlotsBacking construct_tensor_slots_backing( @@ -207,30 +190,10 @@ TensorSlotsBacking construct_tensor_slots_backing( TaskBinding const &binding) { TensorSlotsBacking mapping; - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - - lowered_tensor_t lowered_tensor = - tensor_binding.second.visit(overload{ - [&](tensor_guid_t const &t) { - return local_tensor_backing.tensor_lowering_mapping.at(t); - }, - [&](gradient_tensor_t const &t) { - return local_tensor_backing.gradient_tensor_lowering_mapping.at( - t); - }, - [&](optimizer_tensor_t const &t) { - return local_tensor_backing.optimizer_tensor_lowering_mapping.at( - t); - }, - [&](loss_tensor_t const &t) { - return local_tensor_backing.loss_tensor_lowering_mapping.at(t); - }, - }); - - GenericTensorAccessorW accessor = - local_tensor_backing.tensor_backings.at(lowered_tensor); - mapping.insert({slot_tensor_type_id, accessor}); + for (std::pair const &tensor_binding : + binding.get_tensor_bindings()) { + mapping.insert({tensor_binding.first, + local_tensor_backing.get_tensor(tensor_binding.second)}); } return mapping; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 2679a502e3..8a0dc825eb 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -2,11 +2,10 @@ #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" #include "local-execution/task_signature_impl.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "task-spec/task_invocation.h" - #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" +#include "task-spec/op_task_to_task_invocation.h" +#include "task-spec/task_invocation.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -17,70 +16,93 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, + AllocatedTensors const &allocated_tensors, ComputationGraph const &computation_graph, - LocalTensorBacking const &local_tensor_backing, - LocalArgsBacking const &local_args_backing) - : allocator(allocator), computation_graph(computation_graph), - task_registry(empty_task_registry()), - local_tensor_backing(local_tensor_backing), - local_args_backing(local_args_backing) { - allocate_all_computation_graph_tensors(this->local_tensor_backing, - this->gradient_tensor_source, - this->computation_graph, - this->allocator); - register_all_computation_graph_tasks(this->task_registry, - this->computation_graph); -} + RuntimeArgConfig const 
&runtime_arg_config) + : computation_graph(computation_graph), + task_registry(construct_task_registry(computation_graph)), + local_tensor_backing( + allocated_tensors, + generate_unallocated_tensors(allocated_tensors, + computation_graph, + this->gradient_tensor_source), + allocator), + local_args_backing(initialize_args_backing(this->task_registry, + this->computation_graph, + runtime_arg_config, + this->local_tensor_backing)){}; -DeviceSpecificDeviceStates - call_init_task_impl(TaskRegistry const &task_registry, - task_id_t task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; - return fn(acc); +LocalTrainingBacking::LocalTrainingBacking( + Allocator const &allocator, + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs) + : computation_graph(computation_graph), + task_registry(construct_task_registry(computation_graph)), + local_tensor_backing(allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, + computation_graph, + this->gradient_tensor_source, + this->optimizer_tensor_source, + optimizer_attrs), + allocator), + local_args_backing(initialize_args_backing(this->task_registry, + this->computation_graph, + runtime_arg_config, + this->local_tensor_backing)){}; + +LocalArgsBacking + initialize_args_backing(TaskRegistry const &task_registry, + ComputationGraph const &cg, + RuntimeArgConfig const &runtime_arg_config, + LocalTensorBacking const &local_tensor_backing) { + std::unordered_map + per_device_op_states; + for (layer_guid_t const &node : topological_ordering(cg)) { + if (registry_contains_task_for_layer( + task_registry, node, OpTaskType::INIT)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + + TaskInvocation invocation = + lower_to_task_invocation(init(attrs), + node, + get_incoming_inputs(cg, node), + get_incoming_input_shapes(cg, node), + get_outgoing_tensors(cg, node), + get_incoming_weights(cg, node), + local_tensor_backing.tensor_gradient_mapping, + std::nullopt); + TaskArgumentAccessor accessor = get_task_arg_accessor( + local_tensor_backing, + make_args_backing_with_empty_device_states(runtime_arg_config), + invocation); + TaskSignatureAndImpl task_sig_impl = + task_registry.task_mapping.at(invocation.task_id); + auto fn = task_sig_impl.impl_function.get() + .function_ptr; + DeviceSpecificDeviceStates device_state = fn(accessor); + per_device_op_states.insert({node, device_state}); + } + } + + return LocalArgsBacking{runtime_arg_config, per_device_op_states}; } std::optional call_task_impl(TaskRegistry const &task_registry, - task_id_t task_id, - TaskArgumentAccessor acc) { + task_id_t const &task_id, + TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } -void execute_init(LocalTrainingBacking &local_training_backing, - layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer(local_training_backing.task_registry, - operator_node, - OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(local_training_backing.computation_graph, operator_node) - .attrs; - - TaskInvocation invocation = lower_to_task_invocation( - init(attrs), - operator_node, - local_training_backing.computation_graph, - 
local_training_backing.local_tensor_backing.tensor_gradient_mapping,
-        std::nullopt);
-    TaskArgumentAccessor accessor =
-        get_task_arg_accessor(local_training_backing.local_tensor_backing,
-                              local_training_backing.local_args_backing,
-                              invocation,
-                              local_training_backing.allocator);
-    DeviceSpecificDeviceStates device_state = call_init_task_impl(
-        local_training_backing.task_registry, invocation.task_id, accessor);
-    add_per_device_op_state(
-        local_training_backing.local_args_backing, operator_node, device_state);
-  }
-}
-
 std::optional<float>
-    execute_forward(LocalTrainingBacking &local_training_backing,
-                    layer_guid_t const &operator_node) {
+    execute_forward(LocalTrainingBacking const &local_training_backing,
+                    layer_guid_t const &operator_node) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
                                        OpTaskType::FWD)) {
@@ -94,14 +116,20 @@ std::optional<float>
     TaskInvocation invocation = lower_to_task_invocation(
         forward(attrs),
         operator_node,
-        local_training_backing.computation_graph,
+        get_incoming_inputs(local_training_backing.computation_graph,
+                            operator_node),
+        get_incoming_input_shapes(local_training_backing.computation_graph,
+                                  operator_node),
+        get_outgoing_tensors(local_training_backing.computation_graph,
+                             operator_node),
+        get_incoming_weights(local_training_backing.computation_graph,
+                             operator_node),
         local_training_backing.local_tensor_backing.tensor_gradient_mapping,
         device_state);
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation,
-                              local_training_backing.allocator);
+                              invocation);
     return call_task_impl(
         local_training_backing.task_registry, invocation.task_id, accessor);
   } else {
@@ -109,7 +137,7 @@
   }
 }
 
-void compute_loss(LocalTrainingBacking &local_training_backing,
+void compute_loss(LocalTrainingBacking const &local_training_backing,
                   LossAttrs const &loss_attrs,
                   tensor_guid_t const &logit_tensor,
                   loss_tensor_t const &label_tensor) {
@@ -124,14 +152,13 @@ void compute_loss(LocalTrainingBacking &local_training_backing,
   TaskArgumentAccessor loss_accessor =
       get_task_arg_accessor(local_training_backing.local_tensor_backing,
                             local_training_backing.local_args_backing,
-                            loss_invocation,
-                            local_training_backing.allocator);
+                            loss_invocation);
   TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
   loss_impl_fn.get().function_ptr(loss_accessor);
 }
 
 std::optional<float>
-    execute_backward(LocalTrainingBacking &local_training_backing,
+    execute_backward(LocalTrainingBacking const &local_training_backing,
                      layer_guid_t const &operator_node) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
@@ -146,14 +173,20 @@ std::optional<float>
     TaskInvocation invocation = lower_to_task_invocation(
         backward(attrs),
         operator_node,
-        local_training_backing.computation_graph,
+        get_incoming_inputs(local_training_backing.computation_graph,
+                            operator_node),
+        get_incoming_input_shapes(local_training_backing.computation_graph,
+                                  operator_node),
+        get_outgoing_tensors(local_training_backing.computation_graph,
+                             operator_node),
+        get_incoming_weights(local_training_backing.computation_graph,
+                             operator_node),
        local_training_backing.local_tensor_backing.tensor_gradient_mapping,
         device_state);
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation,
-                              local_training_backing.allocator);
+                              invocation);
    return 
call_task_impl( local_training_backing.task_registry, invocation.task_id, accessor); } else { @@ -161,7 +194,7 @@ std::optional } } -void execute_update(LocalTrainingBacking &local_training_backing, +void execute_update(LocalTrainingBacking const &local_training_backing, layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { LayerAttrs layer_attrs = @@ -191,8 +224,7 @@ void execute_update(LocalTrainingBacking &local_training_backing, TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, - invocation, - local_training_backing.allocator); + invocation); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get().function_ptr(accessor); } @@ -201,14 +233,13 @@ void execute_update(LocalTrainingBacking &local_training_backing, TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, LocalArgsBacking const &local_args_backing, - TaskInvocation const &invocation, - Allocator &allocator) { + TaskInvocation const &invocation) { TensorSlotsBacking tensor_slots_backing = construct_tensor_slots_backing(local_tensor_backing, invocation.binding); ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( invocation.binding, local_args_backing.runtime_arg_config); return TaskArgumentAccessor::create( - allocator, tensor_slots_backing, arg_slots_backing); + local_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 98b8851212..4a22937174 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -8,31 +8,15 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( LocalTrainingBacking const &local_training_backing, tensor_guid_t const &logit_tensor, - TensorShape const &label_tensor_shape, + loss_tensor_t const &label_tensor, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) : training_backing(local_training_backing), loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), - label_tensor( - allocate_loss_tensor(this->training_backing.local_tensor_backing, - this->loss_tensor_source, - label_tensor_shape, - this->training_backing.allocator)) { - allocate_all_optimizer_tensors(this->training_backing.local_tensor_backing, - this->optimizer_tensor_source, - this->training_backing.computation_graph, - this->training_backing.allocator, - this->optimizer_attrs); -} - -void init(ModelTrainingInstance &model_training_instance) { - for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { - execute_init(model_training_instance.training_backing, node); - } -} + label_tensor(label_tensor){}; -PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { +PerLayerElapsedTime + forward(ModelTrainingInstance const &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : topological_ordering( model_training_instance.training_backing.computation_graph)) { @@ -43,7 +27,8 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { +PerLayerElapsedTime + backward(ModelTrainingInstance const &model_training_instance) { 
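   // compute_loss seeds the gradient of the logit tensor against the label
   // tensor; execute_backward then runs each layer's backward task and
   // records its elapsed time, mirroring forward() above.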
compute_loss(model_training_instance.training_backing,
               model_training_instance.loss_attrs,
               model_training_instance.logit_tensor,
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
index 7b0c80a9bc..f33aef8460 100644
--- a/lib/local-execution/src/task_registry.cc
+++ b/lib/local-execution/src/task_registry.cc
@@ -4,44 +4,49 @@
 
 namespace FlexFlow {
 
-TaskRegistry empty_task_registry() {
-  return TaskRegistry{{}, {}, {}, {}};
-}
+TaskRegistry construct_task_registry(ComputationGraph const &cg) {
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> init_task_ids;
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> fwd_task_ids;
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> bwd_task_ids;
+
+  std::unordered_map<task_id_t, TaskSignatureAndImpl> task_mapping;
+
+  for (layer_guid_t const &node : topological_ordering(cg)) {
+    init_task_ids.insert({node, std::nullopt});
+    fwd_task_ids.insert({node, std::nullopt});
+    bwd_task_ids.insert({node, std::nullopt});
 
-void register_tasks_for_layer(TaskRegistry &task_registry,
-                              layer_guid_t const &op_id,
-                              ComputationGraphOpAttrs const &attrs) {
-  task_registry.init_task_ids.insert({op_id, std::nullopt});
-  task_registry.forward_task_ids.insert({op_id, std::nullopt});
-  task_registry.backward_task_ids.insert({op_id, std::nullopt});
+    ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs;
+    std::vector<task_id_t> task_ids = get_task_ids(attrs);
 
-  // register tasks
-  std::vector<task_id_t> task_ids = get_task_ids(attrs);
-  for (task_id_t task_id : task_ids) {
-    TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
-    switch (task_signature_impl.task_signature.type) {
-      case OpTaskType::INIT:
-        assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   init(attrs)));
-        task_registry.init_task_ids[op_id] = task_id;
-        break;
-      case OpTaskType::FWD:
-        assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   forward(attrs)));
-        task_registry.forward_task_ids[op_id] = task_id;
-        break;
-      case OpTaskType::BWD:
-        assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   backward(attrs)));
-        task_registry.backward_task_ids[op_id] = task_id;
-        break;
-      default:
-        throw mk_runtime_error(
-            fmt::format("Invalid OpTaskType, got {}",
-                        task_signature_impl.task_signature.type));
+    for (task_id_t const &task_id : task_ids) {
+      TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
+      switch (task_signature_impl.task_signature.type) {
+        case OpTaskType::INIT:
+          assert(is_invocation_valid(task_signature_impl.task_signature,
+                                     init(attrs)));
+          init_task_ids[node] = task_id;
+          break;
+        case OpTaskType::FWD:
+          assert(is_invocation_valid(task_signature_impl.task_signature,
+                                     forward(attrs)));
+          fwd_task_ids[node] = task_id;
+          break;
+        case OpTaskType::BWD:
+          assert(is_invocation_valid(task_signature_impl.task_signature,
+                                     backward(attrs)));
+          bwd_task_ids[node] = task_id;
+          break;
+        default:
+          throw mk_runtime_error(
+              fmt::format("Invalid OpTaskType, got {}",
+                          task_signature_impl.task_signature.type));
+      }
+      task_mapping.insert({task_id, task_signature_impl});
    }
-    task_registry.task_mapping.insert({task_id, task_signature_impl});
  }
+
+  return TaskRegistry{init_task_ids, fwd_task_ids, bwd_task_ids, task_mapping};
 }
 
 bool registry_contains_task_for_layer(TaskRegistry const &task_registry,
@@ -66,12 +71,4 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry,
   return task_ids.at(op).has_value();
 }
 
-void register_all_computation_graph_tasks(TaskRegistry &registry,
-                                          ComputationGraph const &cg) {
-  for (layer_guid_t const &node : topological_ordering(cg)) {
-    ComputationGraphOpAttrs attrs = 
get_layer_attrs(cg, node).attrs; - register_tasks_for_layer(registry, node, attrs); - } -} - } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 9d4d8c85c1..c473ae1f40 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -34,6 +34,10 @@ std::vector get_incoming_tensors(ComputationGraph const &cg, std::vector get_incoming_inputs(ComputationGraph const &, layer_guid_t const &); + +std::vector get_incoming_input_shapes(ComputationGraph const &, + layer_guid_t const &); + std::vector get_incoming_weights(ComputationGraph const &, layer_guid_t const &); diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 312488bdf5..74448f18bc 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -84,6 +84,13 @@ std::vector get_incoming_tensors(ComputationGraph const &cg, [](DataflowOutput const &o) { return tensor_guid_t{o}; }); } +std::vector get_incoming_input_shapes(ComputationGraph const &cg, + layer_guid_t n) { + return transform(get_incoming_inputs(cg, n), [&](tensor_guid_t const &t) { + return get_tensor_attrs(cg, t).shape; + }); +} + static std::vector get_incoming_tensors_with_role(ComputationGraph const &cg, layer_guid_t const &l, diff --git a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h index 0c5fdb39a4..68c7f05d77 100644 --- a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h @@ -13,7 +13,10 @@ namespace FlexFlow { TaskInvocation lower_to_task_invocation( OpTaskInvocation const &, layer_guid_t const &, - ComputationGraph const &, + std::vector const &input_tensors, + std::vector const &input_tensor_shapes, + std::vector const &output_tensors, + std::vector const &weight_tensors, std::unordered_map const &, std::optional const &); @@ -22,7 +25,7 @@ ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &, - ComputationGraph const &, + std::vector const &, layer_guid_t const &, std::optional const &); diff --git a/lib/task-spec/src/op_task_to_task_invocation.cc b/lib/task-spec/src/op_task_to_task_invocation.cc index f52800a8de..515d1dc1dc 100644 --- a/lib/task-spec/src/op_task_to_task_invocation.cc +++ b/lib/task-spec/src/op_task_to_task_invocation.cc @@ -7,18 +7,14 @@ namespace FlexFlow { TaskInvocation lower_to_task_invocation( OpTaskInvocation const &op_task_invocation, layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, + std::vector const &input_tensors, + std::vector const &input_tensor_shapes, + std::vector const &output_tensors, + std::vector const &weight_tensors, std::unordered_map const &tensor_gradient_mapping, std::optional const &device_states) { TaskBinding binding; - // tensors - std::vector input_tensors = - get_incoming_inputs(computation_graph, layer_guid); - std::vector output_tensors = - get_outgoing_tensors(computation_graph, layer_guid); - std::vector weight_tensors = - get_incoming_weights(computation_graph, layer_guid); for (auto const &tensor_binding : op_task_invocation.binding.get_tensor_bindings()) { @@ -56,7 +52,7 @@ TaskInvocation lower_to_task_invocation( if (arg_binding.second.has()) { ConcreteArgSpec concrete_arg = lower_to_concrete_arg_spec(arg_binding.second.get(), - computation_graph, + 
input_tensor_shapes, layer_guid, device_states); binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); @@ -76,7 +72,7 @@ TaskInvocation lower_to_task_invocation( ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &op_arg_ref_spec, - ComputationGraph const &cg, + std::vector const &input_tensor_shapes, layer_guid_t const &op_guid, std::optional const &device_states) { if (op_arg_ref_spec.holds()) { @@ -86,10 +82,9 @@ ConcreteArgSpec lower_to_concrete_arg_spec( } else if (op_arg_ref_spec.holds()) { ParallelTensorShapeRefType index_op_arg_ref = op_arg_ref_spec.get_ref_type().get(); - tensor_guid_t input_tensor = - get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); - TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); - ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); + TensorShape input_tensor_shape = + input_tensor_shapes.at(index_op_arg_ref.idx); + ParallelTensorShape shape = lift_to_parallel(input_tensor_shape); return ConcreteArgSpec::create(shape); } else { throw mk_runtime_error("Unhandled op arg ref type"); From 187a8d53a1bb5d62ba2f5039ff58a32c0a5a2187 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 12:21:07 -0800 Subject: [PATCH 41/91] Add tests for allocated and unallocated --- lib/kernels/include/kernels/array_shape.h | 7 + lib/kernels/src/array_shape.cc | 11 + .../local-execution/allocated_tensors.h | 17 +- .../allocated_tensors.struct.toml | 5 +- .../local-execution/local_tensor_backing.h | 4 +- .../unallocated_tensors.struct.toml | 5 +- lib/local-execution/src/allocated_tensors.cc | 109 ++--- .../src/local_cost_estimator.cc | 10 +- .../src/local_tensor_backing.cc | 43 +- .../src/local_training_backing.cc | 4 +- lib/local-execution/test/CMakeLists.txt | 4 +- .../test/src/test_allocated_tensors.cc | 221 ++++++++++ .../test/src/test_unallocated_tensors.cc | 383 ++++++++++++++++++ lib/local-execution/test/src/test_utils.cc | 10 + lib/local-execution/test/src/test_utils.h | 10 + lib/pcg/include/pcg/computation_graph.h | 2 + lib/pcg/src/pcg/computation_graph.cc | 10 + lib/utils/include/utils/required_core.h | 2 +- 18 files changed, 766 insertions(+), 91 deletions(-) create mode 100644 lib/local-execution/test/src/test_allocated_tensors.cc create mode 100644 lib/local-execution/test/src/test_unallocated_tensors.cc diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 95d20ceca3..7e14bf41ad 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -69,4 +69,11 @@ std::ostream &operator<<(std::ostream &, ArrayShape const &); } // namespace FlexFlow +namespace std { +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; +} // namespace std + #endif diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 220f8ebeea..521b15e435 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -128,3 +128,14 @@ std::ostream &operator<<(std::ostream &s, ArrayShape const &x) { } } // namespace FlexFlow + +namespace std { +size_t hash::operator()( + ::FlexFlow::ArrayShape const &x) const { + size_t result = 0; + result ^= std::hash<::FlexFlow::LegionOrdered<::FlexFlow::nonnegative_int>>{}( + x.dims) + + 0x9e3779b9 + (result << 6) + (result >> 2); + return result; +} +} // namespace std diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h 
b/lib/local-execution/include/local-execution/allocated_tensors.h index 60ee662ba8..0d01350d9f 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -6,12 +6,17 @@ namespace FlexFlow { -bool are_allocated_forward_tensors_valid(AllocatedTensors const &, - ComputationGraph const &); -bool are_allocated_gradient_tensors_valid(AllocatedTensors const &, - ComputationGraph const &); -bool are_allocated_optimizer_tensors_valid(AllocatedTensors const &, - ComputationGraph const &); +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); + +bool are_allocated_tensors_valid(AllocatedTensors const &, std::unordered_map const &); bool is_allocated_tensor_backing_valid( TensorTypeVariant const &, diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml index e4be709709..09245097b4 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -4,7 +4,6 @@ features = [ "eq", "fmt", "hash", - "ord" ] includes = [ @@ -14,7 +13,9 @@ includes = [ src_includes = [ "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h" + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" ] [[fields]] diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 86244eab13..a43f1a2c81 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -54,12 +54,12 @@ struct LocalTensorBacking { }; UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &, - ComputationGraph const &, + std::unordered_map const &, GradientTensorSource &); UnallocatedTensors generate_unallocated_tensors_with_optimizer(AllocatedTensors const &, - ComputationGraph const &, + std::unordered_map const &, GradientTensorSource &, OptimizerTensorSource &, OptimizerAttrs const &); diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml index 87abf83d13..e86cc2a532 100644 --- a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml @@ -4,7 +4,6 @@ features = [ "eq", "fmt", "hash", - "ord" ] includes = [ @@ -14,7 +13,9 @@ includes = [ src_includes = [ "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h" + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" ] [[fields]] diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index e64db0cfff..19b149e7bd 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -22,17 +22,26 @@ bool is_allocated_tensor_backing_valid( bool are_allocated_forward_tensors_valid( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph) { + std::unordered_map const &tensor_attrs) { + 
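+  // A pre-allocated forward backing is valid only if its tensor_guid_t has
+  // an entry in tensor_attrs and the backing's shape matches that entry;
+  // a tensor_guid_t backing with no attrs entry is dangling and rejected.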
std::unordered_set all_tensor_guids = - set_union(keys(allocated_tensors.gradient_mapping), - keys(allocated_tensors.optimizer_mapping)); + transform( + keys(filter_keys(allocated_tensors.tensor_type_backings, + [&](TensorTypeVariant const &k) { + return k.has(); + })), + [&](TensorTypeVariant const &t) { return t.get(); } + ); + for (tensor_guid_t const &tensor_guid : all_tensor_guids) { - TensorAttrs expected_tensor_attrs = - get_tensor_attrs(computation_graph, tensor_guid); - if (!is_allocated_tensor_backing_valid( - TensorTypeVariant{tensor_guid}, - allocated_tensors.tensor_type_backings, - ArrayShape{expected_tensor_attrs.shape})) { + if (tensor_attrs.count(tensor_guid)) { + if (!is_allocated_tensor_backing_valid( + TensorTypeVariant{tensor_guid}, + allocated_tensors.tensor_type_backings, + ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + return false; + } + } else { return false; } } @@ -41,30 +50,29 @@ bool are_allocated_forward_tensors_valid( bool are_allocated_gradient_tensors_valid( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph) { + std::unordered_map const &tensor_attrs) { std::unordered_set - tensors_in_mappings; // will check whether any dangling gradient tensors - // were allocated + tensors_in_mappings; // will check for dangling gradient tensors for (std::pair const &tensor_to_grad : allocated_tensors.gradient_mapping) { - TensorAttrs expected_tensor_attrs = - get_tensor_attrs(computation_graph, tensor_to_grad.first); - if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { - return false; - } + if (tensor_attrs.count(tensor_to_grad.first)) { + if (tensor_attrs.at(tensor_to_grad.first).create_gradients == + CreateGrad::NO) { + return false; + } - ArrayShape tensor_guid_array_shape = - allocated_tensors.tensor_type_backings - .at(TensorTypeVariant{tensor_to_grad.first}) - .shape; - TensorTypeVariant gradient_tensor = - TensorTypeVariant{tensor_to_grad.second}; - if (is_allocated_tensor_backing_valid( - gradient_tensor, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(gradient_tensor); + ArrayShape tensor_guid_array_shape = ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + TensorTypeVariant gradient_tensor = + TensorTypeVariant{tensor_to_grad.second}; + if (is_allocated_tensor_backing_valid( + gradient_tensor, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(gradient_tensor); + } else { + return false; + } } else { return false; } @@ -83,33 +91,30 @@ bool are_allocated_gradient_tensors_valid( bool are_allocated_optimizer_tensors_valid( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph) { + std::unordered_map const &tensor_attrs) { std::unordered_set - tensors_in_mappings; // will check whether any dangling optimizer tensors - // were allocated + tensors_in_mappings; // will check for dangling optimizer tensors for (std::pair> const &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { - TensorAttrs expected_tensor_attrs = - get_tensor_attrs(computation_graph, tensor_to_optimizers.first); - if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = - allocated_tensors.tensor_type_backings - .at(TensorTypeVariant{tensor_to_optimizers.first}) - .shape; - for (optimizer_tensor_t const &optimizer_tensor : - tensor_to_optimizers.second) { - if (is_allocated_tensor_backing_valid( - 
TensorTypeVariant{optimizer_tensor}, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); - } else { + if (tensor_attrs.count(tensor_to_optimizers.first)) { + if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == + CreateGrad::NO) { return false; } + + ArrayShape tensor_guid_array_shape = ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + for (optimizer_tensor_t const &optimizer_tensor : + tensor_to_optimizers.second) { + if (is_allocated_tensor_backing_valid( + TensorTypeVariant{optimizer_tensor}, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); + } else { + return false; + } + } } } @@ -125,4 +130,10 @@ bool are_allocated_optimizer_tensors_valid( return true; } +bool are_allocated_tensors_valid(AllocatedTensors const & allocated_tensors, std::unordered_map const & tensor_attrs) { + return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) + && are_allocated_gradient_tensors_valid(allocated_tensors, tensor_attrs) + && are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 31418c6bea..c5c2fafa9d 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -89,19 +89,13 @@ CostDetails LocalCostEstimator::estimate_cost( LocalTrainingBacking local_backing( allocator, + AllocatedTensors{{}, {}, {}}, computation_graph, - LocalTensorBacking{}, - LocalArgsBacking{this->runtime_arg_config}); - - allocate_all_computation_graph_tensors(local_backing.local_tensor_backing, - local_backing.gradient_tensor_source, - local_backing.computation_graph, - local_backing.allocator); + this->runtime_arg_config); // execute layer layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - execute_init(local_backing, operator_layer_guid); float fwd = execute_forward(local_backing, operator_layer_guid).value(); float bwd = execute_backward(local_backing, operator_layer_guid).value(); diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index 67bbd59c3b..c37cfc5fc4 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -61,21 +61,25 @@ lowered_tensor_t LocalTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { lowered_tensor_t lowered_tensor = this->lowered_tensor_source.new_lowered_tensor(); - tensor_type.visit(overload{ + tensor_type.visit(overload{ [&](tensor_guid_t const &tensor_guid) { this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); + return std::nullopt; }, [&](gradient_tensor_t const &gradient_tensor) { this->gradient_tensor_lowering_mapping.insert( {gradient_tensor, lowered_tensor}); + return std::nullopt; }, [&](optimizer_tensor_t const &optimizer_tensor) { this->optimizer_tensor_lowering_mapping.insert( {optimizer_tensor, lowered_tensor}); + return std::nullopt; }, [&](loss_tensor_t const &loss_tensor) { this->loss_tensor_lowering_mapping.insert( {loss_tensor, lowered_tensor}); + return std::nullopt; }, [&](auto const &any_tensor) { throw mk_runtime_error( @@ -88,16 +92,16 @@ GenericTensorAccessorW LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { lowered_tensor_t 
lowered_tensor = tensor_type.visit( overload{[&](tensor_guid_t const &tensor_guid) { - this->tensor_lowering_mapping.at(tensor_guid); + return this->tensor_lowering_mapping.at(tensor_guid); }, [&](gradient_tensor_t const &gradient_tensor) { - this->gradient_tensor_lowering_mapping.at(gradient_tensor); + return this->gradient_tensor_lowering_mapping.at(gradient_tensor); }, [&](optimizer_tensor_t const &optimizer_tensor) { - this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); }, [&](loss_tensor_t const &loss_tensor) { - this->loss_tensor_lowering_mapping.at(loss_tensor); + return this->loss_tensor_lowering_mapping.at(loss_tensor); }, [&](auto const &any_tensor) { throw mk_runtime_error( @@ -108,18 +112,18 @@ GenericTensorAccessorW UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph, + std::unordered_map const &tensor_attrs_mapping, GradientTensorSource &gradient_tensor_source) { - assert(are_allocated_forward_tensors_valid(allocated_tensors, - computation_graph)); - assert(are_allocated_gradient_tensors_valid(allocated_tensors, - computation_graph)); + + assert(are_allocated_tensors_valid( + allocated_tensors, tensor_attrs_mapping)); std::unordered_map tensor_type_shapes; std::unordered_map gradient_mapping; - for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid); + for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); @@ -140,15 +144,17 @@ UnallocatedTensors UnallocatedTensors generate_unallocated_tensors_with_optimizer( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph, + std::unordered_map const &tensor_attrs_mapping, GradientTensorSource &gradient_tensor_source, OptimizerTensorSource &optimizer_tensor_source, OptimizerAttrs const &optimizer_attrs) { UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( - allocated_tensors, computation_graph, gradient_tensor_source); - assert(are_allocated_optimizer_tensors_valid(allocated_tensors, - computation_graph)); + allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); + + if (!get_num_optimizer_tensors(optimizer_attrs)) { + return unallocated_tensors; + } std::unordered_map tensor_type_shapes = unallocated_tensors.tensor_type_shapes; @@ -157,8 +163,9 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( std::unordered_map> optimizer_mapping; - for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid); + for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; if (tensor_attrs.create_gradients == CreateGrad::YES && !allocated_tensors.optimizer_mapping.count(tensor_guid)) { std::vector optimizer_tensors; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 8a0dc825eb..cb22240b7f 100644 --- 
a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -24,7 +24,7 @@ LocalTrainingBacking::LocalTrainingBacking( local_tensor_backing( allocated_tensors, generate_unallocated_tensors(allocated_tensors, - computation_graph, + get_all_tensor_attrs(this->computation_graph), this->gradient_tensor_source), allocator), local_args_backing(initialize_args_backing(this->task_registry, @@ -43,7 +43,7 @@ LocalTrainingBacking::LocalTrainingBacking( local_tensor_backing(allocated_tensors, generate_unallocated_tensors_with_optimizer( allocated_tensors, - computation_graph, + get_all_tensor_attrs(this->computation_graph), this->gradient_tensor_source, this->optimizer_tensor_source, optimizer_attrs), diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index 930ab5c4e2..6e3d890176 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -2,7 +2,9 @@ ff_add_test_executable( NAME local-execution-tests SRC_PATTERNS - src/*.cc + src/test_allocated_tensors.cc + src/test_unallocated_tensors.cc + src/test_utils.cc PRIVATE_INCLUDE src/ DEPS diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc new file mode 100644 index 0000000000..59537cfae1 --- /dev/null +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -0,0 +1,221 @@ +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_cpu_allocator.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/loss_tensor_source.h" +#include "pcg/computation_graph.dtg.h" +#include "test/utils/doctest/fmt/pair.h" +#include "test/utils/doctest/fmt/unordered_map.h" +#include "test/utils/doctest/fmt/variant.h" +#include "test/utils/doctest/fmt/vector.h" +#include "test_utils.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("AllocatedTensors") { + MockTensorGuidSource tensor_guid_source; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + Allocator allocator = create_local_cpu_memory_allocator(); + + tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); + + TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 10} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 20} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 30} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::YES + }; + + GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + + std::unordered_map tensor_attrs_mapping = { + {mock_tensor_1, 
tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + }; + + SUBCASE("Trivial tensors") { + SUBCASE("Empty") { + AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Loss tensor") { + loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{loss_tensor}, tensor_backing_1} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + } + + SUBCASE("Forward tensors") { + SUBCASE("Correct forward tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Incorrect forward tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_2} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Dangling tensor guid") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + } + + SUBCASE("Gradient tensors") { + gradient_tensor_t grad_tensor_3 = gradient_tensor_source.new_gradient_tensor(); + + SUBCASE("Gradient tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Dangling gradient tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Dangling gradient tensor in mapping") { + AllocatedTensors allocated_tensors = AllocatedTensors{{}, { + {mock_tensor_3_with_grad, grad_tensor_3} + }, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Gradient allocated for forward tensor without gradient") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {{mock_tensor_2, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Gradient tensor with wrong shape") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_2} + }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Gradient tensor with dangling tensor guid") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {{dangling_tensor, grad_tensor_3}}, {}}; + bool result = 
are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + } + + SUBCASE("Optimizer tensors") { + optimizer_tensor_t optimizer_tensor_3 = optimizer_tensor_source.new_optimizer_tensor(); + + SUBCASE("Optimizer tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Dangling optimizer tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Dangling optimizer tensor in mapping") { + AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor_3}} + }}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Optimizer allocated for forward tensor without gradient") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {{mock_tensor_2, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Optimizer tensor with wrong shape") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2} + }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Optimizer tensor with dangling tensor guid") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {{dangling_tensor, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + } + } +} diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc new file mode 100644 index 0000000000..9802821f3e --- /dev/null +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -0,0 +1,383 @@ +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/local_cpu_allocator.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/loss_tensor_source.h" +#include "pcg/computation_graph.dtg.h" +#include "test/utils/doctest/fmt/pair.h" +#include "test/utils/doctest/fmt/unordered_map.h" +#include "test/utils/doctest/fmt/variant.h" +#include "test/utils/doctest/fmt/vector.h" +#include "test_utils.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("UnallocatedTensors") { + MockTensorGuidSource tensor_guid_source; + OptimizerTensorSource optimizer_tensor_source; + + Allocator allocator = create_local_cpu_memory_allocator(); + + tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + + TensorAttrs 
tensor_attrs_1_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 10} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 20} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 30} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::YES + }; + + GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + + std::unordered_map tensor_attrs_mapping = { + {mock_tensor_1, tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + }; + + SUBCASE("Without optimizer") { + SUBCASE("AllocatedTensors is empty") { + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors(empty, tensor_attrs_mapping, gradient_tensor_source); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains only 1 forward tensor") { + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + }, {}, {}}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains only forward tensors") { + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + }, {}, {}}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = 
mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains only gradient tensor") { + GradientTensorSource gradient_tensor_source; + gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains mixture") { + GradientTensorSource gradient_tensor_source; + gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("Fully AllocatedTensors") { + GradientTensorSource gradient_tensor_source; + gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + UnallocatedTensors correct = UnallocatedTensors{{}, {}, {}}; + CHECK (result == correct); + } + } + + SUBCASE("With optimizer") { + SUBCASE("SGD Attrs") { + SUBCASE("without momentum") { + double momentum = 0.0; + OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensour_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + UnallocatedTensors correct = generate_unallocated_tensors(empty, tensor_attrs_mapping, 
mock_gradient_tensor_source); + CHECK (result == correct); + } + SUBCASE("with momentum") { + double momentum = 0.9; + OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + + SUBCASE("unallocated") { + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensour_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor = mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor}}} + }; + + CHECK (result == correct); + } + + SUBCASE("allocated") { + OptimizerTensorSource optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor = optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor}, tensor_backing_3} + }, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor}} + }}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + + CHECK (result == correct); + } + } + } + SUBCASE("Adam Attrs") { + OptimizerAttrs attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; + SUBCASE("Empty") { + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensour_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor_1 = 
mock_optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_1}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}}} + }; + + CHECK (result == correct); + } + SUBCASE("Partially allocated") { + OptimizerTensorSource optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3} + }, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor_1}} + }}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor_2}}} + }; + + CHECK (result == correct); + } + + SUBCASE("Fully allocated") { + OptimizerTensorSource optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3} + }, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}} + }}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); 
+ + std::unordered_map<TensorTypeVariant, TensorShape> correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + + CHECK (result == correct); + } + } + } + } +} diff --git a/lib/local-execution/test/src/test_utils.cc b/lib/local-execution/test/src/test_utils.cc index 095e1272a2..b7a4e16b97 100644 --- a/lib/local-execution/test/src/test_utils.cc +++ b/lib/local-execution/test/src/test_utils.cc @@ -1,4 +1,5 @@ #include "test_utils.h" +#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { @@ -6,4 +7,13 @@ PerDeviceFFHandle get_mock_per_device_ff_handle() { return {nullptr, nullptr, nullptr, 0, false}; } +size_t MockTensorGuidSource::next_available_mock_tensor_guid = 0; + +MockTensorGuidSource::MockTensorGuidSource() {} + +tensor_guid_t MockTensorGuidSource::new_mock_tensor_guid() { + size_t next_guid = MockTensorGuidSource::next_available_mock_tensor_guid++; + return tensor_guid_t{DataflowOutput{Node{0}, nonnegative_int{next_guid}}}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_utils.h b/lib/local-execution/test/src/test_utils.h index 9a7b3f5991..6d6dcf5afe 100644 --- a/lib/local-execution/test/src/test_utils.h +++ b/lib/local-execution/test/src/test_utils.h @@ -5,6 +5,16 @@ namespace FlexFlow { +struct MockTensorGuidSource { +public: + MockTensorGuidSource(); + + tensor_guid_t new_mock_tensor_guid(); + +private: + static size_t next_available_mock_tensor_guid; +}; + PerDeviceFFHandle get_mock_per_device_ff_handle(); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index c473ae1f40..589496e61b 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -42,6 +42,8 @@ std::vector<tensor_guid_t> get_incoming_weights(ComputationGraph const &, layer_guid_t const &); std::unordered_set<tensor_guid_t> get_all_tensors(ComputationGraph const &); +std::unordered_map<tensor_guid_t, TensorAttrs> + get_all_tensor_attrs(ComputationGraph const &); std::unordered_set<ComputationGraphEdge> get_subgraph_incoming_edges(ComputationGraph const &, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 74448f18bc..728a150c2a 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -135,6 +135,16 @@ std::unordered_set<tensor_guid_t> get_all_tensors(ComputationGraph const &cg) { [](DataflowOutput const &t) { return tensor_guid_t(t); }); } +std::unordered_map<tensor_guid_t, TensorAttrs> + get_all_tensor_attrs(ComputationGraph const &cg) { + std::unordered_set<tensor_guid_t> all_tensors = get_all_tensors(cg); + std::unordered_map<tensor_guid_t, TensorAttrs> all_tensor_attrs; + for (tensor_guid_t const &tensor_guid : all_tensors) { + all_tensor_attrs.insert({tensor_guid, get_tensor_attrs(cg, tensor_guid)}); + } + return all_tensor_attrs; +} + std::unordered_set<ComputationGraphEdge> get_subgraph_incoming_edges( ComputationGraph const &cg, std::unordered_set<layer_guid_t> const &subgraph_nodes) { diff --git a/lib/utils/include/utils/required_core.h b/lib/utils/include/utils/required_core.h index 7a7abcd2c4..8ac772439f 100644 --- a/lib/utils/include/utils/required_core.h +++ b/lib/utils/include/utils/required_core.h @@ -232,7 +232,7 @@ namespace std { template <typename T> struct hash<::FlexFlow::req<T>> { size_t operator()(::FlexFlow::req<T> const &r) const 
{ - return get_std_hash(static_cast<T>(r)); + return ::FlexFlow::get_std_hash(static_cast<T>(r)); } }; From a0f81132754d91f7cacf6250b2fb38c42d58f7fc Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 12:25:58 -0800 Subject: [PATCH 42/91] Fix nonnegative --- lib/local-execution/test/src/test_allocated_tensors.cc | 6 +++--- lib/local-execution/test/src/test_unallocated_tensors.cc | 9 +++------ lib/local-execution/test/src/test_utils.h | 1 + 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 59537cfae1..f4f0664141 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 10} + FFOrdered{16_n, 10_n} }, DataType::FLOAT }, @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 20} + FFOrdered{16_n, 20_n} }, DataType::FLOAT }, @@ -52,7 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 30} + FFOrdered{16_n, 30_n} }, DataType::FLOAT }, diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 9802821f3e..65aabc2043 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 10} + FFOrdered{16_n, 10_n} }, DataType::FLOAT }, @@ -39,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 20} + FFOrdered{16_n, 20_n} }, DataType::FLOAT }, @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 30} + FFOrdered{16_n, 30_n} }, DataType::FLOAT }, @@ -359,9 +359,6 @@ TEST_SUITE(FF_TEST_SUITE) { GradientTensorSource mock_gradient_tensor_source; gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); std::unordered_map<TensorTypeVariant, TensorShape> correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, diff --git a/lib/local-execution/test/src/test_utils.h b/lib/local-execution/test/src/test_utils.h index 6d6dcf5afe..056e92687c 100644 --- a/lib/local-execution/test/src/test_utils.h +++ b/lib/local-execution/test/src/test_utils.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS #include "kernels/ff_handle.h" +#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { From b1eab94bcd7ddf473f65da9b5afa01602115ec5b Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 12:26:55 -0800 Subject: [PATCH 43/91] Format --- .../local-execution/allocated_tensors.h | 4 +- .../local-execution/local_tensor_backing.h | 21 +- lib/local-execution/src/allocated_tensors.cc | 30 +- .../src/local_cost_estimator.cc | 9 +- .../src/local_tensor_backing.cc | 54 +- .../src/local_training_backing.cc | 12 +- .../test/src/test_allocated_tensors.cc 
| 286 ++++---- .../test/src/test_unallocated_tensors.cc | 623 ++++++++++-------- 8 files changed, 582 insertions(+), 457 deletions(-) diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h index 0d01350d9f..7581a159ad 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -16,7 +16,9 @@ bool are_allocated_optimizer_tensors_valid( AllocatedTensors const &, std::unordered_map<tensor_guid_t, TensorAttrs> const &); -bool are_allocated_tensors_valid(AllocatedTensors const &, std::unordered_map<tensor_guid_t, TensorAttrs> const &); +bool are_allocated_tensors_valid( + AllocatedTensors const &, + std::unordered_map<tensor_guid_t, TensorAttrs> const &); bool is_allocated_tensor_backing_valid( TensorTypeVariant const &, diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index a43f1a2c81..c05e39beae 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -53,16 +53,17 @@ struct LocalTensorBacking { LoweredTensorSource lowered_tensor_source; }; -UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &, - std::unordered_map<tensor_guid_t, TensorAttrs> const &, - GradientTensorSource &); - -UnallocatedTensors - generate_unallocated_tensors_with_optimizer(AllocatedTensors const &, - std::unordered_map<tensor_guid_t, TensorAttrs> const &, - GradientTensorSource &, - OptimizerTensorSource &, - OptimizerAttrs const &); +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &, + std::unordered_map<tensor_guid_t, TensorAttrs> const &, + GradientTensorSource &); + +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &, + std::unordered_map<tensor_guid_t, TensorAttrs> const &, + GradientTensorSource &, + OptimizerTensorSource &, + OptimizerAttrs const &); TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, TaskBinding const &); diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 19b149e7bd..3e249bf6d1 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -24,14 +24,11 @@ bool are_allocated_forward_tensors_valid( AllocatedTensors const &allocated_tensors, std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs) { - std::unordered_set<tensor_guid_t> all_tensor_guids = - transform( - keys(filter_keys(allocated_tensors.tensor_type_backings, - [&](TensorTypeVariant const &k) { - return k.has<tensor_guid_t>(); - })), - [&](TensorTypeVariant const &t) { return t.get<tensor_guid_t>(); } - ); + std::unordered_set<tensor_guid_t> all_tensor_guids = transform( + keys(filter_keys( + allocated_tensors.tensor_type_backings, + [&](TensorTypeVariant const &k) { return k.has<tensor_guid_t>(); })), + [&](TensorTypeVariant const &t) { return t.get<tensor_guid_t>(); }); for (tensor_guid_t const &tensor_guid : all_tensor_guids) { if (tensor_attrs.count(tensor_guid)) { @@ -62,7 +59,8 @@ bool are_allocated_gradient_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + ArrayShape tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -103,7 +101,8 @@ bool are_allocated_optimizer_tensors_valid( return false; } - ArrayShape 
tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( @@ -130,10 +129,13 @@ bool are_allocated_optimizer_tensors_valid( return true; } -bool are_allocated_tensors_valid(AllocatedTensors const & allocated_tensors, std::unordered_map const & tensor_attrs) { - return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) - && are_allocated_gradient_tensors_valid(allocated_tensors, tensor_attrs) - && are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); +bool are_allocated_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) && + are_allocated_gradient_tensors_valid(allocated_tensors, + tensor_attrs) && + are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index c5c2fafa9d..41a5df8d48 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -87,11 +87,10 @@ CostDetails LocalCostEstimator::estimate_cost( std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); - LocalTrainingBacking local_backing( - allocator, - AllocatedTensors{{}, {}, {}}, - computation_graph, - this->runtime_arg_config); + LocalTrainingBacking local_backing(allocator, + AllocatedTensors{{}, {}, {}}, + computation_graph, + this->runtime_arg_config); // execute layer layer_guid_t operator_layer_guid = diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index c37cfc5fc4..be84d77906 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -90,38 +90,39 @@ lowered_tensor_t GenericTensorAccessorW LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { - lowered_tensor_t lowered_tensor = tensor_type.visit( - overload{[&](tensor_guid_t const &tensor_guid) { - return this->tensor_lowering_mapping.at(tensor_guid); - }, - [&](gradient_tensor_t const &gradient_tensor) { - return this->gradient_tensor_lowering_mapping.at(gradient_tensor); - }, - [&](optimizer_tensor_t const &optimizer_tensor) { - return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); - }, - [&](loss_tensor_t const &loss_tensor) { - return this->loss_tensor_lowering_mapping.at(loss_tensor); - }, - [&](auto const &any_tensor) { - throw mk_runtime_error( - fmt::format("Unhandled tensor type {}", any_tensor)); - }}); + lowered_tensor_t lowered_tensor = + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + return this->tensor_lowering_mapping.at(tensor_guid); + }, + [&](gradient_tensor_t const &gradient_tensor) { + return this->gradient_tensor_lowering_mapping.at(gradient_tensor); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + }, + [&](loss_tensor_t const &loss_tensor) { + return this->loss_tensor_lowering_mapping.at(loss_tensor); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); return this->tensor_backings.at(lowered_tensor); } -UnallocatedTensors - 
generate_unallocated_tensors(AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source) { +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs_mapping, + GradientTensorSource &gradient_tensor_source) { - assert(are_allocated_tensors_valid( - allocated_tensors, tensor_attrs_mapping)); + assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); std::unordered_map tensor_type_shapes; std::unordered_map gradient_mapping; - for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; @@ -151,7 +152,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); - + if (!get_num_optimizer_tensors(optimizer_attrs)) { return unallocated_tensors; } @@ -163,7 +164,8 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( std::unordered_map> optimizer_mapping; - for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; if (tensor_attrs.create_gradients == CreateGrad::YES && diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index cb22240b7f..35436a60fd 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -21,12 +21,12 @@ LocalTrainingBacking::LocalTrainingBacking( RuntimeArgConfig const &runtime_arg_config) : computation_graph(computation_graph), task_registry(construct_task_registry(computation_graph)), - local_tensor_backing( - allocated_tensors, - generate_unallocated_tensors(allocated_tensors, - get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), - allocator), + local_tensor_backing(allocated_tensors, + generate_unallocated_tensors( + allocated_tensors, + get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source), + allocator), local_args_backing(initialize_args_backing(this->task_registry, this->computation_graph, runtime_arg_config, diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index f4f0664141..99abd538d5 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,8 +1,8 @@ #include "local-execution/allocated_tensors.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" @@ -24,197 +24,211 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); tensor_guid_t mock_tensor_2 = 
tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = + tensor_guid_source.new_mock_tensor_guid(); tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 10_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 20_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 30_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::YES - }; + TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::YES}; - GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + GenericTensorAccessorW tensor_backing_1 = + allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = + allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = + allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + {mock_tensor_1, tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, }; SUBCASE("Trivial tensors") { SUBCASE("Empty") { AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Loss tensor") { loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{loss_tensor}, tensor_backing_1} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{loss_tensor}, tensor_backing_1}}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } } SUBCASE("Forward tensors") { SUBCASE("Correct forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{mock_tensor_1}, tensor_backing_1}}, {}, {}}; + bool result = 
are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Incorrect forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_2} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{mock_tensor_1}, tensor_backing_2}}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, + }, + {}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } } - + SUBCASE("Gradient tensors") { - gradient_tensor_t grad_tensor_3 = gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor_3 = + gradient_tensor_source.new_gradient_tensor(); SUBCASE("Gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, + {{mock_tensor_3_with_grad, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Dangling gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } SUBCASE("Dangling gradient tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{{}, { - {mock_tensor_3_with_grad, grad_tensor_3} - }, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {}, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Gradient allocated for forward tensor without gradient") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {{mock_tensor_2, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, + {{mock_tensor_2, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + 
tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Gradient tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_2} - }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_2}}, + {{mock_tensor_3_with_grad, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Gradient tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {{dangling_tensor, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, + {{dangling_tensor, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } } - + SUBCASE("Optimizer tensors") { - optimizer_tensor_t optimizer_tensor_3 = optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_3 = + optimizer_tensor_source.new_optimizer_tensor(); SUBCASE("Optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Dangling optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } SUBCASE("Dangling optimizer tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor_3}} - }}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {}, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Optimizer allocated for forward tensor without gradient") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {{mock_tensor_2, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors 
allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {{mock_tensor_2, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Optimizer tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2} - }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Optimizer tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {{dangling_tensor, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {{dangling_tensor, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } } } diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 65aabc2043..ddad7f4574 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,9 +1,9 @@ #include "local-execution/allocated_tensors.h" -#include "local-execution/local_tensor_backing.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/local_cpu_allocator.h" +#include "local-execution/local_tensor_backing.h" #include "local-execution/loss_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" @@ -23,173 +23,208 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = + tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 10_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 20_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 30_n} - }, - DataType::FLOAT - }, - 
std::nullopt, - std::nullopt, - CreateGrad::YES - }; + TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::YES}; - GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + GenericTensorAccessorW tensor_backing_1 = + allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = + allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = + allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + {mock_tensor_1, tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, }; SUBCASE("Without optimizer") { SUBCASE("AllocatedTensors is empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors(empty, tensor_attrs_mapping, gradient_tensor_source); - + UnallocatedTensors result = generate_unallocated_tensors( + empty, tensor_attrs_mapping, gradient_tensor_source); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains only 1 forward tensor") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - }, {}, {}}; + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + }, + {}, + {}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = 
mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains only forward tensors") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - }, {}, {}}; + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + }, + {}, + {}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains only gradient tensor") { GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - 
correct_tensor_type_shapes, - {}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains mixture") { GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; + CHECK(result == correct); } SUBCASE("Fully AllocatedTensors") { GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + {TensorTypeVariant{grad_tensor}, 
tensor_backing_3}, + }, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + UnallocatedTensors correct = UnallocatedTensors{{}, {}, {}}; - CHECK (result == correct); + CHECK(result == correct); } } @@ -197,182 +232,252 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("SGD Attrs") { SUBCASE("without momentum") { double momentum = 0.0; - OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + OptimizerAttrs attrs = + OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensour_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + empty, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - UnallocatedTensors correct = generate_unallocated_tensors(empty, tensor_attrs_mapping, mock_gradient_tensor_source); - CHECK (result == correct); + UnallocatedTensors correct = generate_unallocated_tensors( + empty, tensor_attrs_mapping, mock_gradient_tensor_source); + CHECK(result == correct); } SUBCASE("with momentum") { double momentum = 0.9; - OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + OptimizerAttrs attrs = + OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; SUBCASE("unallocated") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensour_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + empty, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = mock_optimizer_tensour_source.new_optimizer_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor}, tensor_attrs_3_with_grad.shape}, - }; + optimizer_tensor_t optimizer_tensor = + mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor}, + tensor_attrs_3_with_grad.shape}, + }; UnallocatedTensors correct = 
UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor}}} - }; + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; - CHECK (result == correct); + CHECK(result == correct); } SUBCASE("allocated") { OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = optimizer_tensour_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor}, tensor_backing_3} - }, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor}} - }}; + optimizer_tensor_t optimizer_tensor = + optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + allocated_optimizer_tensor, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; - CHECK (result == correct); + CHECK(result == correct); } } } SUBCASE("Adam Attrs") { - OptimizerAttrs attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, - /*beta1=*/0.9, - /*beta2=*/0.999, - /*weight_decay=*/0.001, - /*alpha_t=*/0.001, - /*beta_t=*/0.9, - /*beta2_t=*/0.999, - /*epsilon=*/1e-8}}; + OptimizerAttrs attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; SUBCASE("Empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensour_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + empty, + tensor_attrs_mapping, + 
gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = mock_optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = mock_optimizer_tensour_source.new_optimizer_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_1}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}}} - }; + optimizer_tensor_t optimizer_tensor_1 = + mock_optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = + mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_1}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, + {optimizer_tensor_1, optimizer_tensor_2}}}}; - CHECK (result == correct); + CHECK(result == correct); } SUBCASE("Partially allocated") { OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3} - }, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor_1}} - }}; + optimizer_tensor_t optimizer_tensor_1 = + optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + allocated_optimizer_tensor, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); OptimizerTensorSource mock_optimizer_tensour_source; 
optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, - }; + optimizer_tensor_t optimizer_tensor_2 = + optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, + tensor_attrs_3_with_grad.shape}, + }; UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_2}}} - }; + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor_2}}}}; - CHECK (result == correct); + CHECK(result == correct); } SUBCASE("Fully allocated") { OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3} - }, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}} - }}; + optimizer_tensor_t optimizer_tensor_1 = + optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = + optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, + {optimizer_tensor_1, optimizer_tensor_2}}}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + allocated_optimizer_tensor, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; + gradient_tensor_t grad_tensor = + 
mock_gradient_tensor_source.new_gradient_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; - CHECK (result == correct); + CHECK(result == correct); } } } From b532c5023861ea8f0391c0aef4dc86e42cda0d22 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 13:19:25 -0800 Subject: [PATCH 44/91] Pass allocated-unallocated tests --- lib/kernels/src/legion_dim.cc | 6 + .../local-execution/gradient_tensor_source.h | 2 + .../local-execution/optimizer_tensor_source.h | 2 + .../src/gradient_tensor_source.cc | 4 + .../src/local_tensor_backing.cc | 11 +- .../src/local_training_backing.cc | 3 +- .../src/optimizer_tensor_source.cc | 4 + .../test/src/test_unallocated_tensors.cc | 128 +++++++----------- lib/pcg/src/pcg/computation_graph.cc | 2 +- 9 files changed, 73 insertions(+), 89 deletions(-) diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 23875ad916..49b028f227 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -19,4 +19,10 @@ ff_dim_t legion_dim_from_ff_dim(legion_dim_t legion_dim, legion_dim.value.unwrap_nonnegative() - 1}}; } +ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim, + nonnegative_int num_dimensions) { + return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - + legion_dim.value.unwrap_nonnegative() - 1}}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h index e7d24d1ca5..d724859712 100644 --- a/lib/local-execution/include/local-execution/gradient_tensor_source.h +++ b/lib/local-execution/include/local-execution/gradient_tensor_source.h @@ -11,6 +11,8 @@ struct GradientTensorSource { gradient_tensor_t new_gradient_tensor(); + void reset(); + private: static size_t next_available_gradient_tensor_id; }; diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h index 7a5057c84a..b2b3d94ba5 100644 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -11,6 +11,8 @@ struct OptimizerTensorSource { optimizer_tensor_t new_optimizer_tensor(); + void reset(); + private: static size_t next_available_optimizer_tensor_id; }; diff --git a/lib/local-execution/src/gradient_tensor_source.cc b/lib/local-execution/src/gradient_tensor_source.cc index 28cec16ef9..7dcb947e89 100644 --- a/lib/local-execution/src/gradient_tensor_source.cc +++ b/lib/local-execution/src/gradient_tensor_source.cc @@ -11,4 +11,8 @@ gradient_tensor_t GradientTensorSource::new_gradient_tensor() { GradientTensorSource::next_available_gradient_tensor_id++}; } +void GradientTensorSource::reset() { + GradientTensorSource::next_available_gradient_tensor_id = 0; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index be84d77906..b5a0deaee4 100644 --- 
a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -135,7 +135,7 @@ UnallocatedTensors generate_unallocated_tensors( gradient_tensor_t gradient_tensor = gradient_tensor_source.new_gradient_tensor(); tensor_type_shapes.insert( - {TensorTypeVariant{tensor_guid}, tensor_attrs.shape}); + {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); gradient_mapping.insert({tensor_guid, gradient_tensor}); } } @@ -168,8 +168,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_gradients == CreateGrad::YES && - !allocated_tensors.optimizer_mapping.count(tensor_guid)) { + if (tensor_attrs.create_gradients == CreateGrad::YES) { std::vector optimizer_tensors; int num_optimizer_tensors_to_allocate = @@ -178,6 +177,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( num_optimizer_tensors_to_allocate -= allocated_tensors.optimizer_mapping.at(tensor_guid).size(); } + std::cout << num_optimizer_tensors_to_allocate; for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { optimizer_tensor_t optimizer_tensor = @@ -186,7 +186,10 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( tensor_type_shapes.insert( {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); } - optimizer_mapping.insert({tensor_guid, optimizer_tensors}); + + if (num_optimizer_tensors_to_allocate > 0) { + optimizer_mapping.insert({tensor_guid, optimizer_tensors}); + } } } diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 35436a60fd..23db484d0b 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -101,8 +101,7 @@ std::optional call_task_impl(TaskRegistry const &task_registry, std::optional execute_forward(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &operator_node, - Allocator &allocator) { + layer_guid_t const &operator_node) { if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/local-execution/src/optimizer_tensor_source.cc index c241c7f4bd..a1a9a2927d 100644 --- a/lib/local-execution/src/optimizer_tensor_source.cc +++ b/lib/local-execution/src/optimizer_tensor_source.cc @@ -11,4 +11,8 @@ optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() { OptimizerTensorSource::next_available_optimizer_tensor_id++}; } +void OptimizerTensorSource::reset() { + OptimizerTensorSource::next_available_optimizer_tensor_id = 0; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index ddad7f4574..00f4c1c27c 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -17,8 +17,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("UnallocatedTensors") { MockTensorGuidSource tensor_guid_source; + GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); + Allocator allocator = create_local_cpu_memory_allocator(); tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); @@ -26,6 +30,13 
@@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + optimizer_tensor_t optimizer_tensor_1 = + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = + optimizer_tensor_source.new_optimizer_tensor(); + TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, @@ -61,13 +72,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Without optimizer") { SUBCASE("AllocatedTensors is empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors( empty, tensor_attrs_mapping, gradient_tensor_source); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -93,15 +101,12 @@ TEST_SUITE(FF_TEST_SUITE) { }, {}, {}}; - GradientTensorSource gradient_tensor_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_2}, @@ -127,15 +132,13 @@ TEST_SUITE(FF_TEST_SUITE) { }, {}, {}}; - GradientTensorSource gradient_tensor_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{grad_tensor}, @@ -149,9 +152,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("AllocatedTensors contains only gradient tensor") { - GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = - gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ { {TensorTypeVariant{grad_tensor}, tensor_backing_3}, @@ -178,9 +179,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("AllocatedTensors contains mixture") { - GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = - gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ { {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, @@ -206,9 +205,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Fully AllocatedTensors") { - GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = - gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ { {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, @@ -235,8 +232,8 @@ TEST_SUITE(FF_TEST_SUITE) { OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensour_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( empty, @@ -245,9 +242,9 @@ 
TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_source.reset(); UnallocatedTensors correct = generate_unallocated_tensors( - empty, tensor_attrs_mapping, mock_gradient_tensor_source); + empty, tensor_attrs_mapping, gradient_tensor_source); CHECK(result == correct); } SUBCASE("with momentum") { @@ -257,8 +254,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("unallocated") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensour_source; + + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( empty, @@ -267,13 +265,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = - mock_optimizer_tensour_source.new_optimizer_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -284,26 +275,25 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_attrs_3_with_grad.shape}, {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor}, + {TensorTypeVariant{optimizer_tensor_1}, tensor_attrs_3_with_grad.shape}, }; UnallocatedTensors correct = UnallocatedTensors{ correct_tensor_type_shapes, {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; + {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; CHECK(result == correct); } SUBCASE("allocated") { - OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = - optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor}, tensor_backing_3}}, + {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, {}, - {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; - GradientTensorSource gradient_tensor_source; + {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( allocated_optimizer_tensor, @@ -312,10 +302,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -348,8 +334,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*epsilon=*/1e-8}}; SUBCASE("Empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensour_source; + + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( empty, @@ -358,15 +345,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = - mock_optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - mock_optimizer_tensour_source.new_optimizer_tensor(); - 
std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -391,14 +369,16 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } SUBCASE("Partially allocated") { - OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensour_source.new_optimizer_tensor(); + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); + optimizer_tensor_t optimizer_tensor_pre_allocated = + optimizer_tensor_source.new_optimizer_tensor(); AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, + {{TensorTypeVariant{optimizer_tensor_pre_allocated}, + tensor_backing_3}}, {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; - GradientTensorSource gradient_tensor_source; + {{mock_tensor_3_with_grad, {optimizer_tensor_pre_allocated}}}}; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( allocated_optimizer_tensor, @@ -407,14 +387,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensour_source.new_optimizer_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -437,18 +409,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Fully allocated") { - OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensour_source.new_optimizer_tensor(); AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3}}, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}}}}; - GradientTensorSource gradient_tensor_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( allocated_optimizer_tensor, @@ -457,10 +425,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 728a150c2a..1cb7bb6d2a 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -85,7 +85,7 @@ std::vector get_incoming_tensors(ComputationGraph const &cg, } std::vector get_incoming_input_shapes(ComputationGraph const &cg, - layer_guid_t n) { + layer_guid_t const &n) { return transform(get_incoming_inputs(cg, n), [&](tensor_guid_t const &t) { return get_tensor_attrs(cg, t).shape; }); From f28e5c2fbfaa8065dc1ea8d33a0b669e9b763ffe Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 14:34:02 -0800 Subject: [PATCH 45/91] Update task registry tests --- .../include/local-execution/task_registry.h | 3 +- .../src/local_training_backing.cc | 6 +- lib/local-execution/src/task_registry.cc | 15 ++-- lib/local-execution/test/CMakeLists.txt | 2 + .../test/src/test_task_registry.cc | 77 ++++++++++++------- 
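A note before the diffs for this patch: the point of the change is that construct_task_registry now consumes a plain std::unordered_map<layer_guid_t, LayerAttrs> (built via the new get_layer_attrs_mapping helper) instead of walking a ComputationGraph, so the tests below can build registries directly from literal maps. A minimal stand-alone sketch of that construction pattern, with hypothetical stand-in types in place of the FlexFlow headers (illustrative only, not the real API):

    #include <optional>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Stand-in types; hypothetical, for illustration only.
    using LayerId = int;
    using TaskId = int;
    enum class TaskKind { INIT, FWD, BWD };

    struct Registry {
      std::unordered_map<LayerId, std::optional<TaskId>> init_ids, fwd_ids,
          bwd_ids;
    };

    // Every layer gets an entry up front (std::nullopt), then each of its
    // tasks fills the slot for its kind, mirroring construct_task_registry.
    Registry construct_registry(
        std::unordered_map<LayerId,
                           std::vector<std::pair<TaskId, TaskKind>>> const
            &layer_tasks) {
      Registry r;
      for (auto const &[layer, tasks] : layer_tasks) {
        r.init_ids.insert({layer, std::nullopt});
        r.fwd_ids.insert({layer, std::nullopt});
        r.bwd_ids.insert({layer, std::nullopt});
        for (auto const &[task_id, kind] : tasks) {
          switch (kind) {
            case TaskKind::INIT: r.init_ids[layer] = task_id; break;
            case TaskKind::FWD:  r.fwd_ids[layer] = task_id;  break;
            case TaskKind::BWD:  r.bwd_ids[layer] = task_id;  break;
          }
        }
      }
      return r;
    }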
 lib/pcg/include/pcg/computation_graph.h       |  3 +
 lib/pcg/src/pcg/computation_graph.cc          |  9 +++
 7 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h
index 56e98ba8da..eb3e0859d0 100644
--- a/lib/local-execution/include/local-execution/task_registry.h
+++ b/lib/local-execution/include/local-execution/task_registry.h
@@ -9,7 +9,8 @@

 namespace FlexFlow {

-TaskRegistry construct_task_registry(ComputationGraph const &);
+TaskRegistry construct_task_registry(
+    std::unordered_map<layer_guid_t, LayerAttrs> const &);

 bool registry_contains_task_for_layer(TaskRegistry const &,
                                       layer_guid_t const &,
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index 23db484d0b..f09234b920 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -20,7 +20,8 @@ LocalTrainingBacking::LocalTrainingBacking(
     ComputationGraph const &computation_graph,
     RuntimeArgConfig const &runtime_arg_config)
     : computation_graph(computation_graph),
-      task_registry(construct_task_registry(computation_graph)),
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))),
       local_tensor_backing(allocated_tensors,
                            generate_unallocated_tensors(
                                allocated_tensors,
@@ -39,7 +40,8 @@ LocalTrainingBacking::LocalTrainingBacking(
     RuntimeArgConfig const &runtime_arg_config,
     OptimizerAttrs const &optimizer_attrs)
     : computation_graph(computation_graph),
-      task_registry(construct_task_registry(computation_graph)),
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))),
       local_tensor_backing(allocated_tensors,
                            generate_unallocated_tensors_with_optimizer(
                                allocated_tensors,
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
index f33aef8460..487bd4420e 100644
--- a/lib/local-execution/src/task_registry.cc
+++ b/lib/local-execution/src/task_registry.cc
@@ -4,19 +4,22 @@

 namespace FlexFlow {

-TaskRegistry construct_task_registry(ComputationGraph const &cg) {
+TaskRegistry construct_task_registry(
+    std::unordered_map<layer_guid_t, LayerAttrs> const &layer_attrs_mapping) {
   std::unordered_map<layer_guid_t, std::optional<task_id_t>> init_task_ids;
   std::unordered_map<layer_guid_t, std::optional<task_id_t>> fwd_task_ids;
   std::unordered_map<layer_guid_t, std::optional<task_id_t>> bwd_task_ids;
   std::unordered_map<task_id_t, TaskSignatureAndImpl> task_mapping;

-  for (layer_guid_t const &node : topological_ordering(cg)) {
+  for (std::pair<layer_guid_t, LayerAttrs> const &layer_attrs :
+       layer_attrs_mapping) {
+    layer_guid_t node = layer_attrs.first;
     init_task_ids.insert({node, std::nullopt});
     fwd_task_ids.insert({node, std::nullopt});
     bwd_task_ids.insert({node, std::nullopt});

-    ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs;
+    ComputationGraphOpAttrs attrs = layer_attrs.second.attrs;

     std::vector<task_id_t> task_ids = get_task_ids(attrs);
     for (task_id_t const &task_id : task_ids) {
@@ -29,13 +32,13 @@ TaskRegistry construct_task_registry(ComputationGraph const &cg) {
         break;
       case OpTaskType::FWD:
         assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   init(attrs)));
+                                   forward(attrs)));
         fwd_task_ids[node] = task_id;
         break;
       case OpTaskType::BWD:
         assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   init(attrs)));
-        fwd_task_ids[node] = task_id;
+                                   backward(attrs)));
+        bwd_task_ids[node] = task_id;
         break;
       default:
         throw mk_runtime_error(
diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt
index 6e3d890176..fc647cff9b 100644
--- a/lib/local-execution/test/CMakeLists.txt
+++ b/lib/local-execution/test/CMakeLists.txt
@@ -4,7 +4,9 @@ ff_add_test_executable(
   SRC_PATTERNS
     src/test_allocated_tensors.cc
     src/test_unallocated_tensors.cc
+    src/test_task_registry.cc
     src/test_utils.cc
+    src/test_local_task_arg_accessor.cc
   PRIVATE_INCLUDE
     src/
   DEPS
diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc
index 16325d4763..20b4f11a2a 100644
--- a/lib/local-execution/test/src/test_task_registry.cc
+++ b/lib/local-execution/test/src/test_task_registry.cc
@@ -10,7 +10,6 @@ using namespace ::FlexFlow;

 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Task Registry") {
-    TaskRegistry task_registry = empty_task_registry();
     layer_guid_t layer_guid = layer_guid_t{Node{0}};

     nonnegative_int embed_dim = 32_n;
@@ -28,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};

     SUBCASE("register single layer") {
-      register_tasks_for_layer(task_registry, layer_guid, attrs);
+      TaskRegistry task_registry = construct_task_registry(
+          {{layer_guid, LayerAttrs{attrs, std::nullopt}}});

       TaskRegistry correct_task_registry = [&] {
         std::unordered_map<layer_guid_t, std::optional<task_id_t>>
@@ -53,8 +53,10 @@ TEST_SUITE(FF_TEST_SUITE) {

     SUBCASE("multiple layers same task") {
       layer_guid_t other_layer_guid = layer_guid_t{Node{1}};
-      register_tasks_for_layer(task_registry, layer_guid, attrs);
-      register_tasks_for_layer(task_registry, other_layer_guid, attrs);
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{attrs, std::nullopt}},
+          {other_layer_guid, LayerAttrs{attrs, std::nullopt}},
+      });

       SUBCASE("layer to task ids") {
         std::unordered_map<layer_guid_t, std::optional<task_id_t>> correct = {
@@ -64,6 +66,39 @@ TEST_SUITE(FF_TEST_SUITE) {
         CHECK(correct == task_registry.init_task_ids);
       }

+      SUBCASE("task to signature+impl mapping") {
+        std::unordered_map<task_id_t, TaskSignatureAndImpl>
+            correct_task_mapping = {
+                {task_id_t::ATTENTION_INIT_TASK_ID,
+                 get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)},
+                {task_id_t::ATTENTION_FWD_TASK_ID,
+                 get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)},
+                {task_id_t::ATTENTION_BWD_TASK_ID,
+                 get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}};
+        CHECK(correct_task_mapping == task_registry.task_mapping);
+      }
+    }
+    SUBCASE("different attrs, still same task fn mapping") {
+      layer_guid_t layer_1 = layer_guid_t{Node{1}};
+      nonnegative_int embed_dim = 100_n;
+      layer_guid_t layer_2 = layer_guid_t{Node{2}};
+      ComputationGraphOpAttrs other_attrs =
+          ComputationGraphOpAttrs{MultiHeadAttentionAttrs{
+              /*embed_dim=*/embed_dim,
+              /*num_heads=*/num_heads,
+              /*kdim=*/embed_dim,
+              /*vdim=*/embed_dim,
+              /*dropout=*/0.0,
+              /*bias=*/true,
+              /*add_bias_kv=*/false,
+              /*add_zero_attn=*/false,
+          }};
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{attrs, std::nullopt}},
+          {layer_1, LayerAttrs{attrs, std::nullopt}},
+          {layer_2, LayerAttrs{other_attrs, std::nullopt}},
+      });
+
       std::unordered_map<task_id_t, TaskSignatureAndImpl>
           correct_task_mapping = {{task_id_t::ATTENTION_INIT_TASK_ID,
                                    get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)},
                                   {task_id_t::ATTENTION_FWD_TASK_ID,
                                    get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)},
                                   {task_id_t::ATTENTION_BWD_TASK_ID,
                                    get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}};
-      SUBCASE("task to signature+impl mapping") {
-        CHECK(correct_task_mapping == task_registry.task_mapping);
-      }
-      SUBCASE("different attrs, still same task fn mapping") {
-        nonnegative_int embed_dim = 100_n;
-        layer_guid_t layer_3 = layer_guid_t{Node{3}};
-        ComputationGraphOpAttrs other_attrs =
-            ComputationGraphOpAttrs{MultiHeadAttentionAttrs{
-                /*embed_dim=*/embed_dim,
-                /*num_heads=*/num_heads,
-                /*kdim=*/embed_dim,
-                /*vdim=*/embed_dim,
-                /*dropout=*/0.0,
-                /*bias=*/true,
-                /*add_bias_kv=*/false,
-                /*add_zero_attn=*/false,
-            }};
-        register_tasks_for_layer(task_registry, layer_3, other_attrs);
-        CHECK(correct_task_mapping == task_registry.task_mapping);
-      }
+      CHECK(correct_task_mapping == task_registry.task_mapping);
     }

     SUBCASE("equality") {
-      TaskRegistry other_task_registry = empty_task_registry();
       SUBCASE("different attrs is still equal") {
         nonnegative_int embed_dim = 100_n;
         ComputationGraphOpAttrs other_attrs =
@@ -110,16 +125,20 @@ TEST_SUITE(FF_TEST_SUITE) {
             /*add_zero_attn=*/false,
         }};

-        register_tasks_for_layer(task_registry, layer_guid, attrs);
-        register_tasks_for_layer(other_task_registry, layer_guid, other_attrs);
+        TaskRegistry task_registry = construct_task_registry(
+            {{layer_guid, LayerAttrs{attrs, std::nullopt}}});
+        TaskRegistry other_task_registry = construct_task_registry(
+            {{layer_guid, LayerAttrs{other_attrs, std::nullopt}}});
         CHECK(task_registry == other_task_registry);
       }

       SUBCASE("different layer_guid is not equal") {
-        register_tasks_for_layer(task_registry, layer_guid, attrs);
+        TaskRegistry task_registry = construct_task_registry(
+            {{layer_guid, LayerAttrs{attrs, std::nullopt}}});
         layer_guid_t other_layer_guid = layer_guid_t{Node{1}};
-        register_tasks_for_layer(other_task_registry, other_layer_guid, attrs);
+        TaskRegistry other_task_registry = construct_task_registry(
+            {{other_layer_guid, LayerAttrs{attrs, std::nullopt}}});

         CHECK(task_registry != other_task_registry);
       }
diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h
index 589496e61b..e9ee69134d 100644
--- a/lib/pcg/include/pcg/computation_graph.h
+++ b/lib/pcg/include/pcg/computation_graph.h
@@ -57,6 +57,9 @@ std::unordered_set
 LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n);

+std::unordered_map<layer_guid_t, LayerAttrs>
+    get_layer_attrs_mapping(ComputationGraph const &cg);
+
 layer_guid_t get_layer_by_name(ComputationGraph const &cg,
                                std::string const &name);

diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc
index 1cb7bb6d2a..b932910499 100644
--- a/lib/pcg/src/pcg/computation_graph.cc
+++ b/lib/pcg/src/pcg/computation_graph.cc
@@ -190,6 +190,15 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n) {
   return cg.raw_graph.at(n.raw_node);
 }

+std::unordered_map<layer_guid_t, LayerAttrs>
+    get_layer_attrs_mapping(ComputationGraph const &cg) {
+  std::unordered_map<layer_guid_t, LayerAttrs> layer_attrs_mapping;
+  for (layer_guid_t const &layer_guid : get_layers(cg)) {
+    layer_attrs_mapping.insert({layer_guid, get_layer_attrs(cg, layer_guid)});
+  }
+  return layer_attrs_mapping;
+}
+
 layer_guid_t get_layer_by_name(ComputationGraph const &cg,
                                std::string const &name) {
   std::unordered_set<layer_guid_t> found =

From 9c16d7682543092fdfa67dc104066779fc32442b Mon Sep 17 00:00:00 2001
From: fruitea
Date: Wed, 19 Feb 2025 11:31:03 -0800
Subject: [PATCH 46/91] feat: initial implementation of realm-backend

---
 .../include/realm-backend/allocated_tensors.h |  30 ++
 .../allocated_tensors.struct.toml             |  32 ++
 .../realm-backend/model_training_instance.h   |  19 +-
 .../realm-backend/realm_args_backing.h        |  34 +-
 .../realm_task_argument_accessor.h            |  19 +-
 .../realm-backend/realm_tensor_backing.h      |  82 +--
 .../realm-backend/realm_training_backing.h    |  57 ++-
 .../include/realm-backend/task_result.h       |   1 +
 .../unallocated_tensors.struct.toml           |  31 ++
 lib/realm-backend/src/allocated_tensors.cc    | 141 ++++++
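A note before the realm-backend diffs below: the new backing mirrors the local-execution split between AllocatedTensors and UnallocatedTensors, including the counting rule from PATCH 44, which requests only the optimizer tensors that are not already allocated and records an optimizer-mapping entry only when at least one new tensor is needed. A stand-alone sketch of that rule, using plain integer ids in place of tensor_guid_t and optimizer_tensor_t (hypothetical names, illustration only):

    #include <unordered_map>
    #include <vector>

    // Stand-in ids; the real code uses tensor_guid_t and optimizer_tensor_t.
    using TensorId = int;
    using OptTensorId = int;

    // Mirrors the counting rule in generate_unallocated_tensors_with_optimizer:
    // request only the optimizer tensors that are not already allocated.
    std::vector<OptTensorId> request_optimizer_tensors(
        TensorId weight,
        int num_required, // e.g. 1 for SGD with momentum, 2 for Adam
        std::unordered_map<TensorId, std::vector<OptTensorId>> const
            &pre_allocated,
        int &next_id) {
      int to_allocate = num_required;
      if (pre_allocated.count(weight)) {
        to_allocate -= static_cast<int>(pre_allocated.at(weight).size());
      }
      std::vector<OptTensorId> fresh;
      for (int i = 0; i < to_allocate; ++i) {
        fresh.push_back(next_id++);
      }
      // An empty result means no entry is recorded in the optimizer mapping.
      return fresh;
    }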
.../src/model_training_instance.cc | 81 ++- lib/realm-backend/src/realm_args_backing.cc | 84 ++-- .../src/realm_tensor_backing copy.cc | 142 ++++++ lib/realm-backend/src/realm_tensor_backing.cc | 286 +++++++---- .../src/realm_training_backing.cc | 471 +++++++++++------- 15 files changed, 1042 insertions(+), 468 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/allocated_tensors.h create mode 100644 lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml create mode 100644 lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml create mode 100644 lib/realm-backend/src/allocated_tensors.cc create mode 100644 lib/realm-backend/src/realm_tensor_backing copy.cc diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.h b/lib/realm-backend/include/realm-backend/allocated_tensors.h new file mode 100644 index 0000000000..8effd06954 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/allocated_tensors.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H +#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H + +#include "realm-backend/allocated_tensors.dtg.h" +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); + +bool are_allocated_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &, + std::unordered_map const &, + ArrayShape const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml new file mode 100644 index 0000000000..d459027e5d --- /dev/null +++ b/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "AllocatedTensors" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h", + "realm-backend/realm_allocator.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" +] + +[[fields]] +name = "tensor_type_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, std::pair<::FlexFlow::RealmRegion,::FlexFlow::TensorShape>>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index aa3876fb0d..a35cada2d2 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -12,26 +12,23 @@ using PerLayerElapsedTime = std::unordered_map>; struct ModelTrainingInstance { - ModelTrainingInstance(ComputationGraph const &, - RuntimeArgConfig const &, - LossAttrs const &, + ModelTrainingInstance(RealmTrainingBacking const &, tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, + TensorShape const 
&label_tensor_shape, + LossAttrs const &, OptimizerAttrs const &); - void execute_init(); - PerLayerElapsedTime execute_forward(); - PerLayerElapsedTime execute_backward(); - void execute_update(); - - ComputationGraph computation_graph; RealmTrainingBacking training_backing; - LossAttrs loss_attrs; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; + LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; }; +PerLayerElapsedTime forward(ModelTrainingInstance &); +PerLayerElapsedTime backward(ModelTrainingInstance &); +void update(ModelTrainingInstance &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h index 88db880fcb..75f954c0ad 100644 --- a/lib/realm-backend/include/realm-backend/realm_args_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_args_backing.h @@ -1,38 +1,38 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H #define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/per_device_op_state.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" #include "pcg/computation_graph.h" #include "pcg/layer_guid_t.dtg.h" #include "realm-backend/realm_task_argument_accessor.h" #include "realm-backend/task_result.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/per_device_op_state.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { struct RealmArgsBacking { - RealmArgsBacking(RuntimeArgConfig const &); - -public: - void add_per_device_op_state(layer_guid_t const &, - Future &&); - - ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; - - ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; - ConcreteArgSpec lower_to_concrete_arg_spec(OpArgRefSpec const &, - ComputationGraph const &, - layer_guid_t const &) const; + RealmArgsBacking(RuntimeArgConfig const &, + std::unordered_map const &); public: // arguments + RuntimeArgConfig runtime_arg_config; std::unordered_map per_device_op_states; - RuntimeArgConfig runtime_arg_config; }; +RealmArgsBacking +make_args_backing_with_empty_device_states(RuntimeArgConfig const &); + +std::optional +get_per_device_op_state_if_exists(RealmArgsBacking const &, + layer_guid_t const &); + +ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &, + RuntimeArgConfig const &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index 5c7ecafd0f..ce826e162e 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "realm-backend/realm_allocator.h" +#include "task-spec/slot_tensor_type_id.dtg.h" #include #include @@ -14,7 +15,7 @@ using TensorSlotsBacking = std::unordered_map< using ArgSlotsBacking = std::unordered_map; struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { - RealmTaskArgumentAccessor(Allocator const &allocator, + RealmTaskArgumentAccessor(RealmAllocator const 
&allocator, TensorSlotsBacking const &tensor_slots_backing, ArgSlotsBacking const &arg_slots_backing); @@ -23,18 +24,18 @@ struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; - GenericTensorAccessor get_tensor(slot_id_t slot, - Permissions priv, + GenericTensorAccessor get_tensor(slot_id_t slot, Permissions priv, TensorType tensor_type) const override; - VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, TensorType tensor_type) const override; + VariadicGenericTensorAccessor + get_variadic_tensor(slot_id_t slot, Permissions priv, + TensorType tensor_type) const override; Allocator get_allocator() const override; size_t get_device_idx() const override; private: - Allocator allocator; + RealmAllocator allocator; TensorSlotsBacking tensor_slots_backing; ArgSlotsBacking arg_slots_backing; }; @@ -45,8 +46,8 @@ using TensorSlotsBackingWithoutAddresses = std::unordered_map< std::vector>>>; TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &); +get_slots_backing_without_tensor_allocation_addresses( + TensorSlotsBacking const &); CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index d9df0dfcb1..25136ad2ff 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -3,58 +3,70 @@ #define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H #include "kernels/accessor.h" -#include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/realm_allocator.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/loss_tensor_source.h" #include "local-execution/lowered_tensor_source.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" +#include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" #include "pcg/layer_guid_t.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-backend/allocated_tensors.dtg.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_task_argument_accessor.h" +#include "realm-backend/unallocated_tensors.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/tensor_role.dtg.h" namespace FlexFlow { -using TensorRegionMap = - std::unordered_map; -using TensorShapeMap = - std::unordered_map; +using TensorBackingMap = std::unordered_map>; struct RealmTensorBacking { - RealmTensorBacking(); + RealmTensorBacking(AllocatedTensors const &, UnallocatedTensors const &, + RealmAllocator const &); public: - void allocate_layer_tensors(layer_guid_t const &, - ComputationGraph const &, - RealmAllocator &); - void allocate_tensors_by_role(TensorRole const &, - layer_guid_t const &, - ComputationGraph const &, - RealmAllocator &); - void allocate_optimizer_tensors(tensor_guid_t const &, - std::vector const &, - RealmAllocator &); - TensorSlotsBacking - construct_tensor_slots_backing(TaskBinding const &) const; - - GenericTensorAccessorW const & - get_tensor_backing(lowered_tensor_t const &) 
const; - - bool is_tensor_allocated(lowered_tensor_t const &) const; + GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; public: // tensors - TensorRegionMap tensor_regions; - TensorShapeMap tensor_shapes; + TensorBackingMap tensor_backings; + std::unordered_map tensor_lowering_mapping; - std::unordered_map gradient_tensor_lowering_mapping; - std::unordered_map optimizer_tensor_lowering_mapping; - std::unordered_map loss_tensor_lowering_mapping; + std::unordered_map + gradient_tensor_lowering_mapping; + std::unordered_map + optimizer_tensor_lowering_mapping; + std::unordered_map + loss_tensor_lowering_mapping; + + std::unordered_map tensor_gradient_mapping; + std::unordered_map> + tensor_optimizer_mapping; + + RealmAllocator allocator; + +private: + lowered_tensor_t insert_tensor(TensorTypeVariant const &); LoweredTensorSource lowered_tensor_source; }; +GenericTensorAccessorW wrappup_tensor_accessor(std::pair const &); + +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &); + +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &, OptimizerTensorSource &, OptimizerAttrs const &); + +TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, + TaskBinding const &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index ddd3bb7ed1..81df422b7a 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -6,6 +6,7 @@ #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" +#include "realm-backend/allocated_tensors.dtg.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_args_backing.h" @@ -18,28 +19,14 @@ using PerLayerElapsedTime = std::unordered_map>; struct RealmTrainingBacking { - RealmTrainingBacking(ComputationGraph const &, RuntimeArgConfig const &, - Realm::Processor); - void register_and_allocate_layer(layer_guid_t const &); - void allocate_layer_optimizer_tensors(layer_guid_t const &, - OptimizerAttrs const &); + RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + ComputationGraph const &, RuntimeArgConfig const &); - void execute_init(layer_guid_t const &); - Future> execute_forward(layer_guid_t const &); - Future> execute_backward(layer_guid_t const &); - Future execute_update(layer_guid_t const &, OptimizerAttrs const &); - Future compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); - - TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; - - TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, - layer_guid_t const &) const; - - ComputationGraph computation_graph; - TaskRegistry task_registry; + RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + ComputationGraph const &, RuntimeArgConfig const &, + OptimizerAttrs const &); +public: // runtime Realm::Processor master_proc; Realm::Memory master_mem; @@ -47,18 +34,34 @@ struct RealmTrainingBacking { std::unordered_map proc_events; std::vector allocators; - // storage RealmTensorBacking realm_tensor_backing; RealmArgsBacking realm_args_backing; - 
OptimizerTensorSource optimizer_tensor_source; - std::unordered_map> - layer_optimizer_tensor_ids; -private: - std::optional call_task_impl(task_id_t, TaskSignatureAndImpl, - TaskArgumentAccessor); + ComputationGraph computation_graph; + TaskRegistry task_registry; + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; }; +RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, + RuntimeArgConfig const &); + +void execute_init(RealmTrainingBacking &, layer_guid_t const &); +Future> execute_forward(RealmTrainingBacking &, + layer_guid_t const &); +Future> execute_backward(RealmTrainingBacking &, + layer_guid_t const &); +Future compute_loss(RealmTrainingBacking &, LossAttrs const &, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); +Future execute_update(RealmTrainingBacking &, layer_guid_t const &, + OptimizerAttrs const &); + +TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &, + RealmArgsBacking const &, + TaskInvocation const &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index 5fb158496e..4cf8916f85 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -3,6 +3,7 @@ #include "realm-backend/driver.h" #include +#include namespace FlexFlow { diff --git a/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml new file mode 100644 index 0000000000..e86cc2a532 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "UnallocatedTensors" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "op-attrs/tensor_shape.dtg.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" +] + +[[fields]] +name = "tensor_type_shapes" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/src/allocated_tensors.cc b/lib/realm-backend/src/allocated_tensors.cc new file mode 100644 index 0000000000..f27db14643 --- /dev/null +++ b/lib/realm-backend/src/allocated_tensors.cc @@ -0,0 +1,141 @@ +#include "realm-backend/allocated_tensors.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/keys.h" +#include "utils/containers/set_union.h" + +namespace FlexFlow { + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &tensor_type, + std::unordered_map const + &allocated_tensor_backings, + ArrayShape const &expected_shape) { + if (allocated_tensor_backings.count(tensor_type)) { + GenericTensorAccessorW tensor_backing = + allocated_tensor_backings.at(tensor_type); + if (expected_shape == tensor_backing.shape) { + return true; + } + } + return false; +}; + +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + + std::unordered_set all_tensor_guids = transform( + keys(filter_keys( + allocated_tensors.tensor_type_backings, + 
[&](TensorTypeVariant const &k) { return k.has(); })), + [&](TensorTypeVariant const &t) { return t.get(); }); + + for (tensor_guid_t const &tensor_guid : all_tensor_guids) { + if (tensor_attrs.count(tensor_guid)) { + if (!is_allocated_tensor_backing_valid( + TensorTypeVariant{tensor_guid}, + allocated_tensors.tensor_type_backings, + ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + return false; + } + } else { + return false; + } + } + return true; +} + +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + std::unordered_set + tensors_in_mappings; // will check for dangling gradient tensors + + for (std::pair const &tensor_to_grad : + allocated_tensors.gradient_mapping) { + if (tensor_attrs.count(tensor_to_grad.first)) { + if (tensor_attrs.at(tensor_to_grad.first).create_gradients == + CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + TensorTypeVariant gradient_tensor = + TensorTypeVariant{tensor_to_grad.second}; + if (is_allocated_tensor_backing_valid( + gradient_tensor, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(gradient_tensor); + } else { + return false; + } + } else { + return false; + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + return true; +} + +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + std::unordered_set + tensors_in_mappings; // will check for dangling optimizer tensors + + for (std::pair> const + &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { + if (tensor_attrs.count(tensor_to_optimizers.first)) { + if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == + CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + for (optimizer_tensor_t const &optimizer_tensor : + tensor_to_optimizers.second) { + if (is_allocated_tensor_backing_valid( + TensorTypeVariant{optimizer_tensor}, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); + } else { + return false; + } + } + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + + return true; +} + +bool are_allocated_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) && + are_allocated_gradient_tensors_valid(allocated_tensors, + tensor_attrs) && + are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); +} + +} // namespace FlexFlow diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index f9c959c389..acb8edb314 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -5,43 +5,27 @@ namespace FlexFlow { -ModelTrainingInstance::ModelTrainingInstance( - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, 
LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor, + ModelTrainingInstance::ModelTrainingInstance( + RealmTrainingBacking const &realm_training_backing, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, + LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - training_backing(computation_graph, runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { + : training_backing(realm_training_backing), loss_attrs(loss_attrs), + optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), + label_tensor(label_tensor){}; - // allocate each layer's tensors - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.register_and_allocate_layer(node); - this->training_backing.allocate_layer_optimizer_tensors( - node, this->optimizer_attrs); - } -} - -void ModelTrainingInstance::execute_init() { - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.execute_init(node); - } -} - -PerLayerElapsedTime ModelTrainingInstance::execute_forward() { +PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { per_layer_elapsed_time_future.insert( - {node, this->training_backing.execute_forward(node)}); + {node, execute_forward(model_training_instance.training_backing, node)}); } - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { std::optional elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -49,19 +33,22 @@ PerLayerElapsedTime ModelTrainingInstance::execute_forward() { return per_layer_elapsed_time; } -PerLayerElapsedTime ModelTrainingInstance::execute_backward() { - this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, - this->label_tensor); +PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { + compute_loss(model_training_instance.training_backing, + model_training_instance.loss_attrs, + model_training_instance.logit_tensor, + model_training_instance.label_tensor); + PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : - reversed(topological_ordering(this->computation_graph))) { + for (layer_guid_t const &node : reversed(topological_ordering( + model_training_instance.training_backing.computation_graph))) { per_layer_elapsed_time_future.insert( - {node, this->training_backing.execute_backward(node)}); + {node, execute_backward(model_training_instance.training_backing, node)}); } - for (layer_guid_t const &node : - reversed(topological_ordering(this->computation_graph))) { + for (layer_guid_t const &node : reversed(topological_ordering( + model_training_instance.training_backing.computation_graph))) { std::optional elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -69,19 +56,21 @@ 
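// (Hedged usage sketch, not part of this patch: with the member functions
// replaced by free functions -- forward above, backward and update below --
// a hypothetical training step reads
//
//   PerLayerElapsedTime fwd_times = forward(instance);
//   PerLayerElapsedTime bwd_times = backward(instance); // runs compute_loss first
//   update(instance); // advances optimizer_attrs for the next iteration
//
// where `instance` is an assumed, already-constructed ModelTrainingInstance.)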
PerLayerElapsedTime ModelTrainingInstance::execute_backward() { return per_layer_elapsed_time; } -void ModelTrainingInstance::execute_update() { +void update(ModelTrainingInstance &model_training_instance) { std::unordered_map> per_layer_update_future; - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { per_layer_update_future.insert( - {node, this->training_backing.execute_update(node, this->optimizer_attrs)}); + {node, execute_update(model_training_instance.training_backing, + node, + model_training_instance.optimizer_attrs)}); } - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { per_layer_update_future[node].wait(); } - this->optimizer_attrs = - get_optimizer_attrs_for_next_iter(this->optimizer_attrs); + model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter( + model_training_instance.optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc index ae7022f4b0..e20fcdc14d 100644 --- a/lib/realm-backend/src/realm_args_backing.cc +++ b/lib/realm-backend/src/realm_args_backing.cc @@ -1,65 +1,55 @@ #include "op-attrs/parallel_tensor_shape.h" #include "realm-backend/realm_args_backing.h" +#include "task-spec/op_task_to_task_invocation.h" #include "utils/containers/contains_key.h" #include "utils/containers/map_values.h" #include "utils/overload.h" namespace FlexFlow { -void RealmArgsBacking::add_per_device_op_state( - layer_guid_t const &op_guid, Future &&future) { - if (per_device_op_states.find(op_guid) != per_device_op_states.end()) { - throw mk_runtime_error("Op state already exists"); - } - per_device_op_states.insert({op_guid, std::move(future)}); -} +// void RealmArgsBacking::add_per_device_op_state( +// layer_guid_t const &op_guid, Future &&future) +// { +// if (per_device_op_states.find(op_guid) != per_device_op_states.end()) { +// throw mk_runtime_error("Op state already exists"); +// } +// per_device_op_states.insert({op_guid, std::move(future)}); +// } -ArgSlotsBacking RealmArgsBacking::construct_arg_slots_backing( - TaskBinding const &binding) const { - return map_values(binding.get_arg_bindings(), - [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return this->lower_to_concrete_arg_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; +RealmArgsBacking make_args_backing_with_empty_device_states( + RuntimeArgConfig const &runtime_arg_config) { +return RealmArgsBacking{runtime_arg_config, {}}; } -ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec( - OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const &cg, - layer_guid_t const &op_guid) const { - if (op_arg_ref_spec.holds()) { - assert(contains_key(this->per_device_op_states, op_guid)); - DeviceSpecificDeviceStates device_specific = - per_device_op_states.at(op_guid); - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_specific, 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - tensor_guid_t input_tensor = - get_incoming_inputs(cg, 
op_guid).at(index_op_arg_ref.idx); - TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); - ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); - return ConcreteArgSpec::create(shape); +RealmArgsBacking::RealmArgsBacking( + RuntimeArgConfig const &runtime_arg_config, + std::unordered_map const + &device_states) + : runtime_arg_config(runtime_arg_config), + per_device_op_states(device_states){}; + +std::optional get_per_device_op_state_if_exists( + RealmArgsBacking const &realm_args_backing, + layer_guid_t const &layer_guid) { + if (contains_key(realm_args_backing.per_device_op_states, layer_guid)) { + return realm_args_backing.per_device_op_states.at(layer_guid); } else { - throw mk_runtime_error("Unhandled op arg ref type"); + return std::nullopt; } } -ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec( - RuntimeArgRefSpec const &runtime_arg_ref_spec) const { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create( - *(this->runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); - } +ArgSlotsBacking + construct_arg_slots_backing(TaskBinding const &binding, + RuntimeArgConfig const &runtime_arg_config) { + return map_values( + binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { + return arg_binding.template visit( + overload{[&](RuntimeArgRefSpec const &s) { + return lower_to_concrete_arg_spec(s, runtime_arg_config); + }, + [](ConcreteArgSpec const &s) { return s; }}); + }); + ; } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing copy.cc b/lib/realm-backend/src/realm_tensor_backing copy.cc new file mode 100644 index 0000000000..bac16c6b69 --- /dev/null +++ b/lib/realm-backend/src/realm_tensor_backing copy.cc @@ -0,0 +1,142 @@ +#include "task-spec/slot_grad_id.dtg.h" + +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_tensor_backing.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/keys.h" +#include "utils/overload.h" + +namespace FlexFlow { + +RealmTensorBacking::RealmTensorBacking() {}; + +void RealmTensorBacking::allocate_layer_tensors( + layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, + RealmAllocator &allocator) { + this->allocate_tensors_by_role(TensorRole::INPUT, layer_guid, + computation_graph, allocator); + this->allocate_tensors_by_role(TensorRole::WEIGHT, layer_guid, + computation_graph, allocator); + this->allocate_tensors_by_role(TensorRole::OUTPUT, layer_guid, + computation_graph, allocator); +} + +void RealmTensorBacking::allocate_tensors_by_role( + TensorRole const &role, layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, RealmAllocator &allocator) { + std::vector tensors; + switch (role) { + case TensorRole::INPUT: + tensors = get_incoming_inputs(computation_graph, layer_guid); + break; + case TensorRole::WEIGHT: + tensors = get_incoming_weights(computation_graph, layer_guid); + break; + case TensorRole::OUTPUT: + tensors = get_outgoing_tensors(computation_graph, layer_guid); + break; + default: + throw mk_runtime_error("Invalid tensor role, got {}", role); + } + + for (tensor_guid_t const &tensor : tensors) { + TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); + // 
tensor allocation + if (!contains_key(this->tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); + RealmRegion region = + allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); + this->tensor_regions.insert({reduced_tensor, region}); + this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); + } + + // gradient tensor allocation + if (tensor_attrs.create_gradients == CreateGrad::YES && + !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); + RealmRegion region = + allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); + this->tensor_regions.insert({reduced_tensor, region}); + this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); + } + } +} + +void RealmTensorBacking::allocate_optimizer_tensors( + tensor_guid_t const &weight, + std::vector const &optimizer_tensors, + RealmAllocator &allocator) { + GenericTensorAccessorW weight_backing = + this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); + for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) { + // optimizer tensor allocation + if (!contains_key(this->optimizer_tensor_lowering_mapping, + optimizer_tensor)) { + lowered_tensor_t buffer_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->optimizer_tensor_lowering_mapping.insert( + {optimizer_tensor, buffer_tensor}); + TensorShape tensor_shape = + get_tensor_shape(weight_backing.shape, weight_backing.data_type); + RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape)); + this->tensor_regions.insert({buffer_tensor, region}); + this->tensor_shapes.insert({buffer_tensor, tensor_shape}); + } + } +} + +bool RealmTensorBacking::is_tensor_allocated( + lowered_tensor_t const &tensor_id) const { + return contains_key(tensor_regions, tensor_id); +} + +GenericTensorAccessorW const &RealmTensorBacking::get_tensor_backing( + lowered_tensor_t const &tensor_id) const { + void *ptr = this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0); + TensorShape shape = this->tensor_shapes.at(tensor_id); + return {shape.data_type, ArrayShape{shape}, ptr}; +} + +TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing( + TaskBinding const &binding) const { + TensorSlotsBacking mapping; + + for (auto const &tensor_binding : binding.get_tensor_bindings()) { + SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; + + lowered_tensor_t tensor_id = [&] { + TensorTypeVariant tensor_type = tensor_binding.second; + if (tensor_type.has() and + slot_tensor_type_id.tensor_type == TensorType::FORWARD) { + return this->tensor_lowering_mapping.at( + tensor_type.get()); + } else if (tensor_type.has() and + slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { + return this->gradient_tensor_lowering_mapping.at( + tensor_type.get()); + } else if (tensor_type.has()) { + return this->optimizer_tensor_lowering_mapping.at( + tensor_type.get()); + } else if (tensor_type.has()) { + return this->loss_tensor_lowering_mapping.at( + tensor_type.get()); + } else { + throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); + } + }(); + + GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); + mapping.insert({slot_tensor_type_id, accessor}); + } + + return mapping; +} + +} // 
namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc index 6edf6cf064..8f8f828821 100644 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ b/lib/realm-backend/src/realm_tensor_backing.cc @@ -1,124 +1,220 @@ -#include "realm-backend/realm_tensor_backing.h" -#include "local-execution/tensor_lowering.h" #include "op-attrs/parallel_tensor_shape.h" -#include "op-attrs/tensor_shape.dtg.h" +#include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "realm-backend/allocated_tensors.h" #include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_tensor_backing.h" +#include "task-spec/slot_grad_id.dtg.h" #include "utils/containers/contains_key.h" +#include "utils/containers/keys.h" #include "utils/overload.h" -#include "local-execution/slot_grad_id.dtg.h" namespace FlexFlow { -RealmTensorBacking::RealmTensorBacking() {}; - -void RealmTensorBacking::allocate_layer_tensors( - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - RealmAllocator &allocator) { - this->allocate_tensors_by_role( - TensorRole::INPUT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::WEIGHT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::OUTPUT, layer_guid, computation_graph, allocator); +GenericTensorAccessorW wrappup_tensor_accessor( + std::pair const &tensor_region_shape) { + void *ptr = tensor_region_shape.first.instance.pointer_untyped(0, 0); + TensorShape shape = tensor_region_shape.second; + return {shape.data_type, ArrayShape{shape}, ptr}; } -void RealmTensorBacking::allocate_tensors_by_role( - TensorRole const &role, - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - RealmAllocator &allocator) { - std::vector tensors; - switch (role) { - case TensorRole::INPUT: - tensors = get_incoming_inputs(computation_graph, layer_guid); - break; - case TensorRole::WEIGHT: - tensors = get_incoming_weights(computation_graph, layer_guid); - break; - case TensorRole::OUTPUT: - tensors = get_outgoing_tensors(computation_graph, layer_guid); - break; - default: - throw mk_runtime_error("Invalid tensor role, got {}", role); +RealmTensorBacking::RealmTensorBacking( + AllocatedTensors const &allocated_tensors, + UnallocatedTensors const &unallocated_tensors, + RealmAllocator const &allocator) + : tensor_gradient_mapping(allocated_tensors.gradient_mapping), + tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), + allocator(allocator) { + + // handle already-allocated tensors + for (std::pair> const + &tensor_type_backing : allocated_tensors.tensor_type_backings) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(tensor_type_backing.first); + this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); } - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); - // tensor allocation - if (!contains_key(this->tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); - } + // allocate new tensors + 
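// (Editorial note on the merge below: unordered_map::insert over a range
// skips keys that are already present, and generate_unallocated_tensors only
// emits gradient pairings absent from the allocated set, so the two sources
// are assumed disjoint here.)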
this->tensor_gradient_mapping.insert( + unallocated_tensors.gradient_mapping.begin(), + unallocated_tensors.gradient_mapping.end()); - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); + for (std::pair> const + &unallocated_optimizer_tensors : + unallocated_tensors.optimizer_mapping) { + if (this->tensor_optimizer_mapping.count( + unallocated_optimizer_tensors.first)) { + for (optimizer_tensor_t const &optimizer_tensor : + unallocated_optimizer_tensors.second) { + this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first] + .push_back(optimizer_tensor); + } + } else { + this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors}); } } -} -void RealmTensorBacking::allocate_optimizer_tensors( - tensor_guid_t const &weight, - std::vector const& optimizer_tensors, - RealmAllocator &allocator) { - GenericTensorAccessorW weight_backing = this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); - for (optimizer_tensor_t const & optimizer_tensor: optimizer_tensors) { - // optimizer tensor allocation - if (!contains_key(this->optimizer_tensor_lowering_mapping, optimizer_tensor)) { - lowered_tensor_t buffer_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->optimizer_tensor_lowering_mapping.insert({optimizer_tensor, buffer_tensor}); - TensorShape tensor_shape = get_tensor_shape(weight_backing.shape, weight_backing.data_type); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape)); - this->tensor_regions.insert({buffer_tensor, region}); - this->tensor_shapes.insert({buffer_tensor, tensor_shape}); - } + for (std::pair const &tensor_type_shape : + unallocated_tensors.tensor_type_shapes) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(tensor_type_shape.first); + RealmRegion region = allocator.allocate( + get_size_in_bytes(tensor_type_shape.second).unwrap_nonnegative()); + this->tensor_backings.insert( + {lowered_tensor, {region, tensor_type_shape.second}}); } +}; + +lowered_tensor_t +RealmTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { + lowered_tensor_t lowered_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); + return std::nullopt; + }, + [&](gradient_tensor_t const &gradient_tensor) { + this->gradient_tensor_lowering_mapping.insert( + {gradient_tensor, lowered_tensor}); + return std::nullopt; + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + this->optimizer_tensor_lowering_mapping.insert( + {optimizer_tensor, lowered_tensor}); + return std::nullopt; + }, + [&](loss_tensor_t const &loss_tensor) { + this->loss_tensor_lowering_mapping.insert( + {loss_tensor, lowered_tensor}); + return std::nullopt; + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); + return lowered_tensor; } -bool RealmTensorBacking::is_tensor_allocated(lowered_tensor_t const & tensor_id) const { - return contains_key(tensor_regions, 
tensor_id); +GenericTensorAccessorW +RealmTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { + lowered_tensor_t lowered_tensor = + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + return this->tensor_lowering_mapping.at(tensor_guid); + }, + [&](gradient_tensor_t const &gradient_tensor) { + return this->gradient_tensor_lowering_mapping.at(gradient_tensor); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + }, + [&](loss_tensor_t const &loss_tensor) { + return this->loss_tensor_lowering_mapping.at(loss_tensor); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); + return wrappup_tensor_accessor(this->tensor_backings.at(lowered_tensor)); } -GenericTensorAccessorW const &RealmTensorBacking::get_tensor_backing( - lowered_tensor_t const &tensor_id) const { - void *ptr = this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0); - TensorShape shape = this->tensor_shapes.at(tensor_id); - return {shape.data_type, ArrayShape{shape}, ptr}; +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs_mapping, + GradientTensorSource &gradient_tensor_source) { + + assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); + + std::unordered_map tensor_type_shapes; + std::unordered_map gradient_mapping; + + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; + TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; + if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { + tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); + } + + if (tensor_attrs.create_gradients == CreateGrad::YES && + !allocated_tensors.gradient_mapping.count(tensor_guid)) { + gradient_tensor_t gradient_tensor = + gradient_tensor_source.new_gradient_tensor(); + tensor_type_shapes.insert( + {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); + gradient_mapping.insert({tensor_guid, gradient_tensor}); + } + } + + return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}}; } -TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { - TensorSlotsBacking mapping; +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs_mapping, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, + OptimizerAttrs const &optimizer_attrs) { + + UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( + allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); + + if (!get_num_optimizer_tensors(optimizer_attrs)) { + return unallocated_tensors; + } + + std::unordered_map tensor_type_shapes = + unallocated_tensors.tensor_type_shapes; + std::unordered_map gradient_mapping = + unallocated_tensors.gradient_mapping; + std::unordered_map> + optimizer_mapping; + + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; + if (tensor_attrs.create_gradients == CreateGrad::YES) { + std::vector optimizer_tensors; + + int num_optimizer_tensors_to_allocate = + 
get_num_optimizer_tensors(optimizer_attrs);
+      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
+        num_optimizer_tensors_to_allocate -=
+            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
+      }
-  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
-    SlotTensorTypeId slot_tensor_type_id = tensor_binding.first;
-
-    lowered_tensor_t tensor_id = [&] {
-      TensorTypeVariant tensor_type = tensor_binding.second;
-      if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::FORWARD) {
-        return this->tensor_lowering_mapping.at(tensor_type.get());
-      } else if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::GRADIENT) {
-        return this->gradient_tensor_lowering_mapping.at(tensor_type.get());
-      } else if (tensor_type.has()) {
-        return this->optimizer_tensor_lowering_mapping.at(tensor_type.get());
-      } else if (tensor_type.has()) {
-        return this->loss_tensor_lowering_mapping.at(tensor_type.get());
-      } else {
-        throw mk_runtime_error(fmt::format("Tensor binding has invalid type"));
+      for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) {
+        optimizer_tensor_t optimizer_tensor =
+            optimizer_tensor_source.new_optimizer_tensor();
+        optimizer_tensors.push_back(optimizer_tensor);
+        tensor_type_shapes.insert(
+            {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape});
       }
-    }();
-    GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id);
-    mapping.insert({slot_tensor_type_id, accessor});
+      if (num_optimizer_tensors_to_allocate > 0) {
+        optimizer_mapping.insert({tensor_guid, optimizer_tensors});
+      }
+    }
+  }
+
+  return UnallocatedTensors{tensor_type_shapes, gradient_mapping,
+                            optimizer_mapping};
+}
+
+TensorSlotsBacking
+construct_tensor_slots_backing(RealmTensorBacking const &realm_tensor_backing,
+                               TaskBinding const &binding) {
+  TensorSlotsBacking mapping;
+
+  for (std::pair const &tensor_binding :
+       binding.get_tensor_bindings()) {
+    mapping.insert({tensor_binding.first,
+                    realm_tensor_backing.get_tensor(tensor_binding.second)});
   }

   return mapping;
diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index 24829a77b1..f6b516e303 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -1,30 +1,34 @@
 #include "local-execution/loss_functions.h"
 #include "local-execution/optimizer.h"
-#include "local-execution/task_id_t.dtg.h"
-#include "local-execution/task_invocation.h"
 #include "local-execution/task_signature_impl.h"
-#include "local-execution/tensor_lowering.h"
+#include "pcg/computation_graph.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/optimizer_attrs.h"
-#include "realm-backend/realm_training_backing.h"
-#include "realm-backend/task_result.h"
-#include "realm-backend/task_wrapper.h"
+#include "realm-backend/realm_tensor_backing.h"
+#include "task-spec/op_task_to_task_invocation.h"
+#include "task-spec/runtime_arg_config.h"
+#include "task-spec/task_invocation.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
 #include "utils/containers/values.h"
 #include "utils/exception.h"
+
+#include "realm-backend/realm_training_backing.h"
+#include "realm-backend/task_result.h"
+#include "realm-backend/task_wrapper.h"
+
 namespace FlexFlow {

 using namespace Realm;

 RealmTrainingBacking::RealmTrainingBacking(
+    Processor master_proc, AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
-    RuntimeArgConfig const &runtime_arg_config, Realm::Processor master_proc)
+    RuntimeArgConfig const &runtime_arg_config)
     : computation_graph(computation_graph),
-      realm_args_backing(runtime_arg_config),
-      task_registry(empty_task_registry()) {
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))) {
   this->master_proc = master_proc;
   proc_events.insert({master_proc, Realm::Event::NO_EVENT});
   master_mem = Machine::MemoryQuery(Machine::get_machine())
@@ -36,95 +40,212 @@ RealmTrainingBacking::RealmTrainingBacking(
   for (Processor p : pq) {
     worker_procs.push_back(p);
     proc_events.insert({p, Realm::Event::NO_EVENT});
-    allocators.push_back(RealmAllocator(p));
+    allocators.push_back(RealmAllocator::create(p));
   }
   assert(worker_procs.size() > 0);
-}

-void RealmTrainingBacking::register_and_allocate_layer(
-    layer_guid_t const &node) {
-  ComputationGraphOpAttrs attrs =
-      get_layer_attrs(this->computation_graph, node).attrs;
-  this->realm_tensor_backing.allocate_layer_tensors(
-      node, this->computation_graph, this->allocators[0]);
-  register_tasks_for_layer(this->task_registry, node, attrs);
-  // TODO: multi gpu launching
-  std::vector task_ids = get_task_ids(attrs);
-  for (task_id_t task_id : task_ids) {
-    TaskSignatureAndImpl task_signature_impl =
-        this->task_registry.task_mapping.at(task_id);
-    register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl);
+  // register tasks for realm
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, node).attrs;
+    if (attrs.has()) {
+      OpTaskInvocation op_task_invocation = attrs.get();
+      std::vector task_ids = get_task_ids(attrs);
+      for (task_id_t task_id : task_ids) {
+        TaskSignatureAndImpl task_signature_impl =
+            this->task_registry.task_mapping.at(task_id);
+        // TODO: multi gpu
+        register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl);
+      }
+    }
   }
+
+  // TODO: multi gpu
+  realm_tensor_backing = RealmTensorBacking(
+      allocated_tensors,
+      generate_unallocated_tensors(
+          allocated_tensors, get_all_tensor_attrs(this->computation_graph),
+          this->gradient_tensor_source),
+      allocators[0]);
+  realm_args_backing = initialize_args_backing(this, runtime_arg_config);
+}

-void RealmTrainingBacking::allocate_layer_optimizer_tensors(
-    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
-  ComputationGraphOpAttrs attrs =
-      get_layer_attrs(this->computation_graph, node).attrs;
-  if (attrs.has()) {
-    TaskSignature sig = get_update_signature(optimizer_attrs);
-    tensor_guid_t weight_tensor =
-        get_only(get_outgoing_tensors(this->computation_graph, node));
+RealmTrainingBacking::RealmTrainingBacking(
+    Processor master_proc, AllocatedTensors const &allocated_tensors,
+    ComputationGraph const &computation_graph,
+    RuntimeArgConfig const &runtime_arg_config,
+    OptimizerAttrs const &optimizer_attrs)
+    : computation_graph(computation_graph),
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))) {
+  this->master_proc = master_proc;
+  proc_events.insert({master_proc, Realm::Event::NO_EVENT});
+  master_mem = Machine::MemoryQuery(Machine::get_machine())
+                   .only_kind(Memory::SYSTEM_MEM)
+                   .best_affinity_to(master_proc)
+                   .first();
+  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
+                                   .only_kind(Processor::TOC_PROC);
+  for (Processor p : pq) {
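// (Editorial note: Processor::TOC_PROC selects Realm's throughput-optimized
// cores, i.e. GPUs; each one gets an event-chain entry and its own
// RealmAllocator below, though only worker_procs[0] is used until the
// multi-GPU TODOs in this file are resolved.)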
worker_procs.push_back(p); + proc_events.insert({p, Realm::Event::NO_EVENT}); + allocators.push_back(RealmAllocator::create(p)); + } + assert(worker_procs.size() > 0); - std::vector optimizer_tensors; - for (TensorTypeSlotSpec const &tensor_type_slot_spec : - values(sig.tensor_guid_slots)) { - optimizer_tensors.push_back( - this->optimizer_tensor_source.new_optimizer_tensor()); + // register tasks for realm + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { + ComputationGraphOpAttrs attrs = + get_layer_attrs(this->computation_graph, node).attrs; + if (attrs.has()) { + OpTaskInvocation op_task_invocation = attrs.get(); + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = + this->task_registry.task_mapping.at(task_id); + // TODO: multi gpu + register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); + } } - this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); - this->realm_tensor_backing.allocate_optimizer_tensors( - weight_tensor, optimizer_tensors, this->allocators[0]); } + + // TODO: multi gpu + realm_tensor_backing = RealmTensorBacking( + allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source, this->optimizer_tensor_source, + optimizer_attrs), + allocators[0]); + realm_args_backing = initialize_args_backing(this, runtime_arg_config); } -void RealmTrainingBacking::execute_init(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer(this->task_registry, operator_node, - OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = - this->lower_to_task_invocation(init(attrs), operator_node); - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); - task_id_t task_id = invocation.task_id; - TaskImplFunction impl_function = - this->task_registry.task_mapping.at(task_id).impl_function; - // TODO: multi gpu launching - Promise promise(master_mem); - Future future = promise.get_future(); - RealmTaskArgs args{ - task_id, impl_function, accessor, std::move(promise)}; - Event e = worker_procs[0].spawn(static_cast(task_id), - &args, sizeof(args), - proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; - future.set_event(e); - this->realm_args_backing.add_per_device_op_state(operator_node, - std::move(future.get())); +RealmArgsBacking +initialize_args_backing(RealmTrainingBacking *backing, + RuntimeArgConfig const &runtime_arg_config) { + // initialize_args_backing(TaskRegistry const &task_registry, + // ComputationGraph const &cg, + // RuntimeArgConfig const &runtime_arg_config, + // RealmTensorBacking const &realm_tensor_backing) { + std::unordered_map + per_device_op_states; + TaskRegistry const &task_registry = backing->task_registry; + ComputationGraph const &cg = backing->computation_graph; + RealmTensorBacking const &realm_tensor_backing = + backing->realm_tensor_backing; + Processor master_proc = backing->master_proc; + Memory master_mem = backing->master_mem; + std::vector &worker_procs = backing->worker_procs; + std::unordered_map &proc_events = backing->proc_events; + + for (layer_guid_t const &node : topological_ordering(cg)) { + if (registry_contains_task_for_layer(task_registry, node, + OpTaskType::INIT)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + + TaskInvocation invocation = 
lower_to_task_invocation( + init(attrs), node, get_incoming_inputs(cg, node), + get_incoming_input_shapes(cg, node), get_outgoing_tensors(cg, node), + get_incoming_weights(cg, node), + realm_tensor_backing.tensor_gradient_mapping, std::nullopt); + TaskArgumentAccessor accessor = get_task_arg_accessor( + realm_tensor_backing, + make_args_backing_with_empty_device_states(runtime_arg_config), + invocation); + task_id_t task_id = invocation.task_id; + TaskImplFunction impl_function = + task_registry.task_mapping.at(task_id).impl_function; + // TODO: multi gpu launching + Promise promise(master_mem); + Future future = promise.get_future(); + RealmTaskArgs args{ + task_id, impl_function, accessor, std::move(promise)}; + Event e = worker_procs[0].spawn( + static_cast(task_id), &args, sizeof(args), + proc_events[worker_procs[0]]); + proc_events[worker_procs[0]] = e; + future.set_event(e); + per_device_op_states.insert({node, std::move(future.get())}); + } } + + return RealmArgsBacking{runtime_arg_config, per_device_op_states}; } +// void RealmTrainingBacking::register_and_allocate_layer( +// layer_guid_t const &node) { +// ComputationGraphOpAttrs attrs = +// get_layer_attrs(this->computation_graph, node).attrs; +// this->realm_tensor_backing.allocate_layer_tensors( +// node, this->computation_graph, this->allocators[0]); +// } + +// void RealmTrainingBacking::allocate_layer_optimizer_tensors( +// layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { +// ComputationGraphOpAttrs attrs = +// get_layer_attrs(this->computation_graph, node).attrs; +// if (attrs.has()) { +// TaskSignature sig = get_update_signature(optimizer_attrs); +// tensor_guid_t weight_tensor = +// get_only(get_outgoing_tensors(this->computation_graph, node)); + +// std::vector optimizer_tensors; +// for (TensorTypeSlotSpec const &tensor_type_slot_spec : +// values(sig.tensor_guid_slots)) { +// optimizer_tensors.push_back( +// this->optimizer_tensor_source.new_optimizer_tensor()); +// } +// this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); +// this->realm_tensor_backing.allocate_optimizer_tensors( +// weight_tensor, optimizer_tensors, this->allocators[0]); +// } +// } + Future> -RealmTrainingBacking::execute_forward(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer(this->task_registry, operator_node, - OpTaskType::FWD)) { +execute_forward(RealmTrainingBacking &realm_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(realm_training_backing.task_registry, + operator_node, OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = - this->lower_to_task_invocation(forward(attrs), operator_node); - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + get_layer_attrs(realm_training_backing.computation_graph, operator_node) + .attrs; + std::optional device_state = + get_per_device_op_state_if_exists( + realm_training_backing.realm_args_backing, operator_node); + TaskInvocation invocation = lower_to_task_invocation( + forward(attrs), operator_node, + get_incoming_inputs(realm_training_backing.computation_graph, + operator_node), + get_incoming_input_shapes(realm_training_backing.computation_graph, + operator_node), + get_outgoing_tensors(realm_training_backing.computation_graph, + operator_node), + get_incoming_weights(realm_training_backing.computation_graph, + operator_node), + 
realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
+        device_state);
+    TaskArgumentAccessor accessor = get_task_arg_accessor(
+        realm_training_backing.realm_tensor_backing,
+        realm_training_backing.realm_args_backing, invocation);
     task_id_t task_id = invocation.task_id;
     TaskImplFunction impl_function =
-        this->task_registry.task_mapping.at(task_id).impl_function;
+        realm_training_backing.task_registry.task_mapping.at(task_id)
+            .impl_function;
     // TODO: multi gpu launching
-    Promise> promise(master_mem);
+    Promise> promise(realm_training_backing.master_mem);
     Future> future = promise.get_future();
     RealmTaskArgs> args{task_id, impl_function, accessor, std::move(promise)};
-    Event e = worker_procs[0].spawn(static_cast(task_id),
-                                    &args, sizeof(args),
-                                    proc_events[worker_procs[0]]);
-    proc_events[worker_procs[0]] = e;
+    Event e = realm_training_backing.worker_procs[0].spawn(
+        static_cast(task_id), &args, sizeof(args),
+        realm_training_backing
+            .proc_events[realm_training_backing.worker_procs[0]]);
+    realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] =
+        e;
     future.set_event(e);
     return future;
   } else {
@@ -133,26 +254,46 @@ RealmTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
 }

 Future>
-RealmTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(this->task_registry, operator_node,
-                                       OpTaskType::BWD)) {
+execute_backward(RealmTrainingBacking &realm_training_backing,
+                 layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(realm_training_backing.task_registry,
+                                       operator_node, OpTaskType::BWD)) {
     ComputationGraphOpAttrs attrs =
-        get_layer_attrs(this->computation_graph, operator_node).attrs;
-    TaskInvocation invocation =
-        this->lower_to_task_invocation(backward(attrs), operator_node);
-    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+        get_layer_attrs(realm_training_backing.computation_graph, operator_node)
+            .attrs;
+    std::optional device_state =
+        get_per_device_op_state_if_exists(
+            realm_training_backing.realm_args_backing, operator_node);
+    TaskInvocation invocation = lower_to_task_invocation(
+        backward(attrs), operator_node,
+        get_incoming_inputs(realm_training_backing.computation_graph,
+                            operator_node),
+        get_incoming_input_shapes(realm_training_backing.computation_graph,
+                                  operator_node),
+        get_outgoing_tensors(realm_training_backing.computation_graph,
+                             operator_node),
+        get_incoming_weights(realm_training_backing.computation_graph,
+                             operator_node),
+        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
+        device_state);
+    TaskArgumentAccessor accessor = get_task_arg_accessor(
+        realm_training_backing.realm_tensor_backing,
+        realm_training_backing.realm_args_backing, invocation);
     task_id_t task_id = invocation.task_id;
     TaskImplFunction impl_function =
-        this->task_registry.task_mapping.at(task_id).impl_function;
+        realm_training_backing.task_registry.task_mapping.at(task_id)
+            .impl_function;
     // TODO: multi gpu launching
-    Promise> promise(master_mem);
+    Promise> promise(realm_training_backing.master_mem);
     Future> future = promise.get_future();
     RealmTaskArgs> args{task_id, impl_function, accessor, std::move(promise)};
-    Event e = worker_procs[0].spawn(static_cast(task_id),
-                                    &args, sizeof(args),
-                                    proc_events[worker_procs[0]]);
-    proc_events[worker_procs[0]] = e;
+    Event e = realm_training_backing.worker_procs[0].spawn(
+        static_cast(task_id), &args, sizeof(args),
+        realm_training_backing
.proc_events[realm_training_backing.worker_procs[0]]); + realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = + e; future.set_event(e); return future; } else { @@ -160,34 +301,50 @@ RealmTrainingBacking::execute_backward(layer_guid_t const &operator_node) { } } -Future -RealmTrainingBacking::execute_update(layer_guid_t const &node, - OptimizerAttrs const &optimizer_attrs) { - LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); +Future execute_update(RealmTrainingBacking const &realm_training_backing, + layer_guid_t const &node, + OptimizerAttrs const &optimizer_attrs) { + LayerAttrs layer_attrs = + get_layer_attrs(realm_training_backing.computation_graph, node); if (layer_attrs.attrs.has()) { // get tensors - tensor_guid_t weight_tensor = - get_only(get_outgoing_tensors(this->computation_graph, node)); + tensor_guid_t weight_tensor = get_only( + get_outgoing_tensors(realm_training_backing.computation_graph, node)); + + gradient_tensor_t weight_grad_tensor = + realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + weight_tensor); std::vector optimizer_buffer_tensors = - this->layer_optimizer_tensor_ids.at(node); + realm_training_backing.realm_tensor_backing.tensor_optimizer_mapping.at( + weight_tensor); + // get invocation - TaskInvocation invocation = get_update_invocation( - optimizer_attrs, weight_tensor, optimizer_buffer_tensors); + TaskInvocation invocation = + get_update_invocation(optimizer_attrs, weight_tensor, + weight_grad_tensor, optimizer_buffer_tensors); + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_update_signature(attrs), invocation)); - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + + // execute update + TaskArgumentAccessor accessor = get_task_arg_accessor( + realm_training_backing.realm_tensor_backing, + realm_training_backing.realm_args_backing, invocation); task_id_t task_id = invocation.task_id; - register_wrapper_tasks_generic(worker_procs[0], task_id); + register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + task_id); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); RealmTaskArgs args{task_id, update_impl_fn, accessor, std::move(promise)}; - Event e = worker_procs[0].spawn(static_cast(task_id), - &args, sizeof(args), - proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; + Event e = realm_training_backing.worker_procs[0].spawn( + static_cast(task_id), &args, sizeof(args), + realm_training_backing + .proc_events[realm_training_backing.worker_procs[0]]); + realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = + e; future.set_event(e); return future; } else { @@ -195,98 +352,50 @@ RealmTrainingBacking::execute_update(layer_guid_t const &node, } } -Future -RealmTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor) { - TaskInvocation loss_invocation = - backward(loss_attrs, logit_tensor, label_tensor); +Future compute_loss(RealmTrainingBacking const &realm_training_backing, + LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor) { + TaskInvocation loss_invocation = backward( + loss_attrs, logit_tensor, + realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + logit_tensor), + label_tensor); // TODO: 
https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); - TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation); + TaskArgumentAccessor loss_accessor = get_task_arg_accessor( + realm_training_backing.realm_tensor_backing, + realm_training_backing.realm_args_backing, loss_invocation); task_id_t task_id = loss_invocation.task_id; - register_wrapper_tasks_generic(worker_procs[0], task_id); + register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + task_id); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); RealmTaskArgs args{task_id, loss_impl_fn, loss_accessor, std::move(promise)}; - Event e = - worker_procs[0].spawn(static_cast(task_id), &args, - sizeof(args), proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; + Event e = realm_training_backing.worker_procs[0].spawn( + static_cast(task_id), &args, sizeof(args), + realm_training_backing + .proc_events[realm_training_backing.worker_procs[0]]); + realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = + e; future.set_event(e); return future; } -TaskArgumentAccessor RealmTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation) const { +TaskArgumentAccessor +get_task_arg_accessor(RealmTensorBacking const &realm_tensor_backing, + RealmArgsBacking const &realm_args_backing, + TaskInvocation const &invocation) { TensorSlotsBacking tensor_slots_backing = - this->realm_tensor_backing.construct_tensor_slots_backing( - invocation.binding); - ArgSlotsBacking arg_slots_backing = - this->realm_args_backing.construct_arg_slots_backing(invocation.binding); + construct_tensor_slots_backing(realm_tensor_backing, invocation.binding); + ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( + invocation.binding, realm_args_backing.runtime_arg_config); + // TODO: multi gpu return TaskArgumentAccessor::create( - this->allocators[0], tensor_slots_backing, arg_slots_backing); -} - -TaskInvocation RealmTrainingBacking::lower_to_task_invocation( - OpTaskInvocation const &op_task_invocation, - layer_guid_t const &layer_guid) const { - TaskBinding binding; - // tensors - for (auto const &tensor_binding : - op_task_invocation.binding.get_tensor_bindings()) { - tensor_guid_t tensor_to_bind = [&]() -> tensor_guid_t { - switch (tensor_binding.second.role) { - case TensorRole::INPUT: - return get_incoming_inputs(this->computation_graph, layer_guid) - .at(tensor_binding.second.idx); - case TensorRole::OUTPUT: - return get_outgoing_tensors(this->computation_graph, layer_guid) - .at(tensor_binding.second.idx); - case TensorRole::WEIGHT: - return get_incoming_weights(this->computation_graph, layer_guid) - .at(tensor_binding.second.idx); - default: - throw mk_runtime_error( - fmt::format("Invalid tensor role {}", tensor_binding.second.role)); - } - }(); - - if (tensor_binding.first.is_grad == IsGrad::NO) { - binding.bind(tensor_binding.first.slot_id, tensor_to_bind); - } else if (tensor_binding.first.is_grad == IsGrad::YES) { - binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind); - } else { - throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", - tensor_binding.first.is_grad)); - } - } - - // args - for (auto const &arg_binding : - op_task_invocation.binding.get_arg_bindings()) { - if (arg_binding.second.has()) { - ConcreteArgSpec concrete_arg = - 
this->realm_args_backing.lower_to_concrete_arg_spec( - arg_binding.second.get(), this->computation_graph, - layer_guid); - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); - } else if (arg_binding.second.has()) { - binding.insert_arg_spec( - arg_binding.first, - TaskArgSpec{arg_binding.second.get()}); - } else { - binding.insert_arg_spec( - arg_binding.first, - TaskArgSpec{arg_binding.second.get()}); - } - } - - return TaskInvocation{op_task_invocation.task_id, binding}; + realm_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow From 89752fa904e5112f735f8957e1d7beb3bf1995f5 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Fri, 21 Feb 2025 21:28:39 -0800 Subject: [PATCH 47/91] Move local tensor backing to dtgen --- .../local_task_argument_accessor.h | 9 - .../local-execution/local_tensor_backing.h | 65 ++-- .../local_tensor_backing.struct.toml | 34 ++ .../local-execution/local_training_backing.h | 22 +- .../local-execution/model_training_instance.h | 13 +- .../local-execution/unallocated_tensors.h | 27 ++ .../src/local_cost_estimator.cc | 6 +- .../src/local_task_argument_accessor.cc | 31 -- .../src/local_tensor_backing.cc | 225 +++---------- .../src/local_training_backing.cc | 76 +++-- .../src/model_training_instance.cc | 49 ++- .../src/unallocated_tensors.cc | 93 ++++++ lib/local-execution/test/CMakeLists.txt | 1 + .../test/src/test_local_slots_backing.cc | 309 ------------------ .../test/src/test_local_tensor_backing.cc | 152 +++++++++ .../test/src/test_task_registry.cc | 69 ++++ .../test/src/test_unallocated_tensors.cc | 2 +- 17 files changed, 546 insertions(+), 637 deletions(-) create mode 100644 lib/local-execution/include/local-execution/local_tensor_backing.struct.toml create mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.h create mode 100644 lib/local-execution/src/unallocated_tensors.cc delete mode 100644 lib/local-execution/test/src/test_local_slots_backing.cc create mode 100644 lib/local-execution/test/src/test_local_tensor_backing.cc diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index b1e5a02985..c46534330b 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -39,15 +39,6 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { ArgSlotsBacking arg_slots_backing; }; -using TensorSlotsBackingWithoutAddresses = std::unordered_map< - SlotTensorTypeId, - std::variant, - std::vector>>>; - -TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &); - CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index c05e39beae..70a2474159 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -6,6 +6,7 @@ #include "local-execution/allocated_tensors.dtg.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" +#include "local-execution/local_tensor_backing.dtg.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/lowered_tensor_source.h" 
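// (Hedged sketch of the post-dtgen API, not part of this patch: with
// LocalTensorBacking now a plain generated struct, behavior lives in the
// free functions declared below, e.g.
//
//   LocalTensorBacking backing = construct_local_tensor_backing(
//       allocated, unallocated, allocator);
//   GenericTensorAccessorW acc = get_tensor(backing, tensor_type);
//
// where `allocated`, `unallocated`, `allocator`, and `tensor_type` are
// assumed to be in scope.)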
#include "local-execution/optimizer_tensor_source.h" @@ -19,51 +20,25 @@ namespace FlexFlow { -using TensorBackingMap = - std::unordered_map; - -struct LocalTensorBacking { - LocalTensorBacking(AllocatedTensors const &, - UnallocatedTensors const &, - Allocator const &); - -public: - GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; - -public: - // tensors - TensorBackingMap tensor_backings; - - std::unordered_map tensor_lowering_mapping; - std::unordered_map - gradient_tensor_lowering_mapping; - std::unordered_map - optimizer_tensor_lowering_mapping; - std::unordered_map - loss_tensor_lowering_mapping; - - std::unordered_map tensor_gradient_mapping; - std::unordered_map> - tensor_optimizer_mapping; - - Allocator allocator; - -private: - lowered_tensor_t insert_tensor(TensorTypeVariant const &); - LoweredTensorSource lowered_tensor_source; -}; - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &); - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &, - OptimizerTensorSource &, - OptimizerAttrs const &); +GenericTensorAccessorW get_tensor(LocalTensorBacking const &, + TensorTypeVariant const &); + +std::unordered_map + get_tensor_backings( + std::unordered_map const &, + std::unordered_map const &, + Allocator &); + +std::unordered_map> + merge_optimizer_mappings( + std::unordered_map> const + &allocated, + std::unordered_map> const + &unallocated); + +LocalTensorBacking construct_local_tensor_backing(AllocatedTensors const &, + UnallocatedTensors const &, + Allocator &); TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, TaskBinding const &); diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml new file mode 100644 index 0000000000..c34063af5d --- /dev/null +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "LocalTensorBacking" +features = [ + "eq", + "fmt", + "hash" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h", + "pcg/tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_t.dtg.h", + "task-spec/optimizer_tensor_t.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "tensor_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" + +[[fields]] +name = "tensor_gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "tensor_optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index b61d20c232..8c2bb34130 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -13,12 +13,12 @@ namespace FlexFlow { struct LocalTrainingBacking { - LocalTrainingBacking(Allocator const &, + LocalTrainingBacking(Allocator &, AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &); - 
LocalTrainingBacking(Allocator const &, + LocalTrainingBacking(Allocator &, AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &, @@ -38,27 +38,33 @@ struct LocalTrainingBacking { LocalArgsBacking initialize_args_backing(TaskRegistry const &, ComputationGraph const &, RuntimeArgConfig const &, - LocalTensorBacking const &); + LocalTensorBacking const &, + Allocator &); std::optional call_task_impl(TaskRegistry const &, task_id_t const &task_id, TaskArgumentAccessor const &acc); std::optional execute_forward(LocalTrainingBacking const &, - layer_guid_t const &); + layer_guid_t const &, + Allocator &); std::optional execute_backward(LocalTrainingBacking const &, - layer_guid_t const &); + layer_guid_t const &, + Allocator &); void compute_loss(LocalTrainingBacking const &, LossAttrs const &, tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); + loss_tensor_t const &label_tensor, + Allocator &); void execute_update(LocalTrainingBacking const &, layer_guid_t const &, - OptimizerAttrs const &); + OptimizerAttrs const &, + Allocator &); TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, LocalArgsBacking const &, - TaskInvocation const &); + TaskInvocation const &, + Allocator &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 99a1bd5a9a..b36b20ed04 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -12,22 +12,25 @@ using PerLayerElapsedTime = std::unordered_map>; struct ModelTrainingInstance { - ModelTrainingInstance(LocalTrainingBacking const &, + ModelTrainingInstance(Allocator const &, + LocalTrainingBacking const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); + Allocator allocator; LocalTrainingBacking training_backing; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; -}; -PerLayerElapsedTime forward(ModelTrainingInstance const &); -PerLayerElapsedTime backward(ModelTrainingInstance const &); -void update(ModelTrainingInstance &); +public: + PerLayerElapsedTime forward(); + PerLayerElapsedTime backward(); + void update(); +}; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.h b/lib/local-execution/include/local-execution/unallocated_tensors.h new file mode 100644 index 0000000000..63ead67589 --- /dev/null +++ b/lib/local-execution/include/local-execution/unallocated_tensors.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H +#define _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H + +#include "local-execution/allocated_tensors.dtg.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/unallocated_tensors.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/tensor_attrs.dtg.h" + +namespace FlexFlow { + +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &); + +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &, + OptimizerTensorSource &, + OptimizerAttrs const &); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 41a5df8d48..7d05bb1e81 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -95,8 +95,10 @@ CostDetails LocalCostEstimator::estimate_cost( // execute layer layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - float fwd = execute_forward(local_backing, operator_layer_guid).value(); - float bwd = execute_backward(local_backing, operator_layer_guid).value(); + float fwd = + execute_forward(local_backing, operator_layer_guid, allocator).value(); + float bwd = + execute_backward(local_backing, operator_layer_guid, allocator).value(); float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index fb6db2ed98..e53e3abeff 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -57,37 +57,6 @@ Allocator LocalTaskArgumentAccessor::get_allocator() const { return this->allocator; } -TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &slots_backing) { - - TensorSlotsBackingWithoutAddresses addressless_slots_backing; - - using TensorAccessorVariant = - std::variant>; - for (auto const &slot_tensor : slots_backing) { - TensorAccessorVariant accessor_variant = slot_tensor.second; - std::visit( - overload{ - [&](GenericTensorAccessorW const &accessor) { - addressless_slots_backing.insert( - {slot_tensor.first, get_shape_and_datatype(accessor)}); - }, - [&](std::vector const &variadic_accessor) { - std::vector> - variadic_addressless_accessor = - transform(variadic_accessor, - [](GenericTensorAccessorW const &accessor) { - return get_shape_and_datatype(accessor); - }); - addressless_slots_backing.insert( - {slot_tensor.first, variadic_addressless_accessor}); - }}, - accessor_variant); - } - return addressless_slots_backing; -} - size_t LocalTaskArgumentAccessor::get_device_idx() const { return 0; } diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index b5a0deaee4..629117508f 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,200 +1,81 @@ #include "local-execution/local_tensor_backing.h" -#include "task-spec/slot_grad_id.dtg.h" - -#include "local-execution/allocated_tensors.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" +#include "task-spec/slot_grad_id.dtg.h" #include "utils/containers/contains_key.h" #include "utils/containers/keys.h" #include "utils/overload.h" namespace FlexFlow { -LocalTensorBacking::LocalTensorBacking( - AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator const &allocator) - : tensor_gradient_mapping(allocated_tensors.gradient_mapping), - tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), - allocator(allocator) { - - // handle already-allocated tensors - for (std::pair const - &tensor_type_backing : allocated_tensors.tensor_type_backings) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(tensor_type_backing.first); - this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); - } - - // allocate new tensors - this->tensor_gradient_mapping.insert( - 
unallocated_tensors.gradient_mapping.begin(),
-      unallocated_tensors.gradient_mapping.end());
+GenericTensorAccessorW
+    get_tensor(LocalTensorBacking const &local_tensor_backing,
+               TensorTypeVariant const &tensor_type) {
+  return local_tensor_backing.tensor_backings.at(tensor_type);
+}

+std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+    merge_optimizer_mappings(
+        std::unordered_map<tensor_guid_t,
+                           std::vector<optimizer_tensor_t>> const &allocated,
+        std::unordered_map<tensor_guid_t,
+                           std::vector<optimizer_tensor_t>> const
+            &unallocated) {
+  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+      merged_maps = allocated;
   for (std::pair<tensor_guid_t, std::vector<optimizer_tensor_t>> const
-           &unallocated_optimizer_tensors :
-               unallocated_tensors.optimizer_mapping) {
-    if (this->tensor_optimizer_mapping.count(
-            unallocated_optimizer_tensors.first)) {
+           &unallocated_optimizer_tensors : unallocated) {
+    if (merged_maps.count(unallocated_optimizer_tensors.first)) {
       for (optimizer_tensor_t const &optimizer_tensor :
            unallocated_optimizer_tensors.second) {
-        this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first]
-            .push_back(optimizer_tensor);
+        merged_maps[unallocated_optimizer_tensors.first].push_back(
+            optimizer_tensor);
       }
     } else {
-      this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors});
+      merged_maps.insert({unallocated_optimizer_tensors});
     }
   }
-
-  for (std::pair<TensorTypeVariant, TensorShape> const &tensor_type_shape :
-       unallocated_tensors.tensor_type_shapes) {
-    lowered_tensor_t lowered_tensor =
-        this->insert_tensor(tensor_type_shape.first);
-    GenericTensorAccessorW tensor_backing =
-        this->allocator.allocate_tensor(tensor_type_shape.second);
-    this->tensor_backings.insert({lowered_tensor, tensor_backing});
-  }
-};
-
-lowered_tensor_t
-    LocalTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) {
-  lowered_tensor_t lowered_tensor =
-      this->lowered_tensor_source.new_lowered_tensor();
-  tensor_type.visit(overload{
-      [&](tensor_guid_t const &tensor_guid) {
-        this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](gradient_tensor_t const &gradient_tensor) {
-        this->gradient_tensor_lowering_mapping.insert(
-            {gradient_tensor, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](optimizer_tensor_t const &optimizer_tensor) {
-        this->optimizer_tensor_lowering_mapping.insert(
-            {optimizer_tensor, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](loss_tensor_t const &loss_tensor) {
-        this->loss_tensor_lowering_mapping.insert(
-            {loss_tensor, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](auto const &any_tensor) {
-        throw mk_runtime_error(
-            fmt::format("Unhandled tensor type {}", any_tensor));
-      }});
-  return lowered_tensor;
+  return merged_maps;
 }

-GenericTensorAccessorW
-    LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const {
-  lowered_tensor_t lowered_tensor =
-      tensor_type.visit(overload{
-          [&](tensor_guid_t const &tensor_guid) {
-            return this->tensor_lowering_mapping.at(tensor_guid);
-          },
-          [&](gradient_tensor_t const &gradient_tensor) {
-            return this->gradient_tensor_lowering_mapping.at(gradient_tensor);
-          },
-          [&](optimizer_tensor_t const &optimizer_tensor) {
-            return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor);
-          },
-          [&](loss_tensor_t const &loss_tensor) {
-            return this->loss_tensor_lowering_mapping.at(loss_tensor);
-          },
-          [&](auto const &any_tensor) {
-            throw mk_runtime_error(
-                fmt::format("Unhandled tensor type {}", any_tensor));
-          }});
-  return this->tensor_backings.at(lowered_tensor);
-}
-
-UnallocatedTensors generate_unallocated_tensors(
-    AllocatedTensors const &allocated_tensors,
-    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
-    GradientTensorSource &gradient_tensor_source) {
-
-  assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping));
-
-  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes;
-  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping;
+std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+    get_tensor_backings(
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const
+            &tensor_type_backings,
+        std::unordered_map<TensorTypeVariant, TensorShape> const
+            &tensor_type_shapes,
+        Allocator &allocator) {
+  std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+      all_tensor_backings = tensor_type_backings;

-  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
-       tensor_attrs_mapping) {
-    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
-    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
-    TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid};
-    if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) {
-      tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape});
-    }
-
-    if (tensor_attrs.create_gradients == CreateGrad::YES &&
-        !allocated_tensors.gradient_mapping.count(tensor_guid)) {
-      gradient_tensor_t gradient_tensor =
-          gradient_tensor_source.new_gradient_tensor();
-      tensor_type_shapes.insert(
-          {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape});
-      gradient_mapping.insert({tensor_guid, gradient_tensor});
-    }
+  // allocate new tensors
+  for (std::pair<TensorTypeVariant, TensorShape> const &tensor_type_shape :
+       tensor_type_shapes) {
+    GenericTensorAccessorW tensor_backing =
+        allocator.allocate_tensor(tensor_type_shape.second);
+    all_tensor_backings.insert({tensor_type_shape.first, tensor_backing});
   }

-  return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}};
+  return all_tensor_backings;
 }

-UnallocatedTensors generate_unallocated_tensors_with_optimizer(
+LocalTensorBacking construct_local_tensor_backing(
     AllocatedTensors const &allocated_tensors,
-    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
-    GradientTensorSource &gradient_tensor_source,
-    OptimizerTensorSource &optimizer_tensor_source,
-    OptimizerAttrs const &optimizer_attrs) {
-
-  UnallocatedTensors unallocated_tensors = generate_unallocated_tensors(
-      allocated_tensors, tensor_attrs_mapping, gradient_tensor_source);
-
-  if (!get_num_optimizer_tensors(optimizer_attrs)) {
-    return unallocated_tensors;
-  }
-
-  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes =
-      unallocated_tensors.tensor_type_shapes;
-  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping =
-      unallocated_tensors.gradient_mapping;
-  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
-      optimizer_mapping;
-
-  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
-       tensor_attrs_mapping) {
-    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
-    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
-    if (tensor_attrs.create_gradients == CreateGrad::YES) {
-      std::vector<optimizer_tensor_t> optimizer_tensors;
-
-      int num_optimizer_tensors_to_allocate =
-          get_num_optimizer_tensors(optimizer_attrs);
-      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
-        num_optimizer_tensors_to_allocate -=
-            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
-      }
-      std::cout << num_optimizer_tensors_to_allocate;
-
-      for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) {
-        optimizer_tensor_t optimizer_tensor =
-            optimizer_tensor_source.new_optimizer_tensor();
-        optimizer_tensors.push_back(optimizer_tensor);
-        tensor_type_shapes.insert(
-            {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape});
-      }
-
-      if (num_optimizer_tensors_to_allocate > 0) {
-        optimizer_mapping.insert({tensor_guid, optimizer_tensors});
-      }
-    }
-  }
-
-  return UnallocatedTensors{
-      tensor_type_shapes, gradient_mapping, optimizer_mapping};
+    UnallocatedTensors const &unallocated_tensors,
+    Allocator &allocator) {
+
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> merged_gradient_maps =
+      allocated_tensors.gradient_mapping;
+  merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(),
+                              unallocated_tensors.gradient_mapping.end());
+
+  return LocalTensorBacking{
+      get_tensor_backings(allocated_tensors.tensor_type_backings,
+                          unallocated_tensors.tensor_type_shapes,
+                          allocator),
+      merged_gradient_maps,
+      merge_optimizer_mappings(allocated_tensors.optimizer_mapping,
+                               unallocated_tensors.optimizer_mapping)};
 }

 TensorSlotsBacking construct_tensor_slots_backing(
@@ -205,7 +86,7 @@ TensorSlotsBacking construct_tensor_slots_backing(
   for (std::pair<SlotTensorTypeId, TensorTypeVariant> const &tensor_binding :
        binding.get_tensor_bindings()) {
     mapping.insert({tensor_binding.first,
-                    local_tensor_backing.get_tensor(tensor_binding.second)});
+                    get_tensor(local_tensor_backing, tensor_binding.second)});
   }

   return mapping;
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index f09234b920..576ab53859 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -2,6 +2,7 @@
 #include "local-execution/loss_functions.h"
 #include "local-execution/optimizer.h"
 #include "local-execution/task_signature_impl.h"
+#include "local-execution/unallocated_tensors.h"
 #include "pcg/computation_graph.h"
 #include "pcg/optimizer_attrs.h"
 #include "task-spec/op_task_to_task_invocation.h"
@@ -15,26 +16,28 @@ namespace FlexFlow {

 LocalTrainingBacking::LocalTrainingBacking(
-    Allocator const &allocator,
+    Allocator &allocator,
     AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
     RuntimeArgConfig const &runtime_arg_config)
     : computation_graph(computation_graph),
       task_registry(construct_task_registry(
           get_layer_attrs_mapping(this->computation_graph))),
-      local_tensor_backing(allocated_tensors,
-                           generate_unallocated_tensors(
-                               allocated_tensors,
-                               get_all_tensor_attrs(this->computation_graph),
-                               this->gradient_tensor_source),
-                           allocator),
+      local_tensor_backing(construct_local_tensor_backing(
+          allocated_tensors,
+          generate_unallocated_tensors(
+              allocated_tensors,
+              get_all_tensor_attrs(this->computation_graph),
+              this->gradient_tensor_source),
+          allocator)),
       local_args_backing(initialize_args_backing(this->task_registry,
                                                  this->computation_graph,
                                                  runtime_arg_config,
-                                                 this->local_tensor_backing)){};
+                                                 this->local_tensor_backing,
+                                                 allocator)){};

 LocalTrainingBacking::LocalTrainingBacking(
-    Allocator const &allocator,
+    Allocator &allocator,
     AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
     RuntimeArgConfig const &runtime_arg_config,
@@ -42,24 +45,27 @@ LocalTrainingBacking::LocalTrainingBacking(
     : computation_graph(computation_graph),
       task_registry(construct_task_registry(
           get_layer_attrs_mapping(this->computation_graph))),
-      local_tensor_backing(allocated_tensors,
-                           generate_unallocated_tensors_with_optimizer(
-                               allocated_tensors,
-                               get_all_tensor_attrs(this->computation_graph),
-                               this->gradient_tensor_source,
-                               this->optimizer_tensor_source,
-                               optimizer_attrs),
-                           allocator),
+      local_tensor_backing(construct_local_tensor_backing(
+          allocated_tensors,
+          generate_unallocated_tensors_with_optimizer(
+              allocated_tensors,
+              get_all_tensor_attrs(this->computation_graph),
+              this->gradient_tensor_source,
+              this->optimizer_tensor_source,
+              optimizer_attrs),
+          allocator)),
       local_args_backing(initialize_args_backing(this->task_registry,
                                                  this->computation_graph,
                                                  runtime_arg_config,
-                                                 this->local_tensor_backing)){};
+                                                 this->local_tensor_backing,
+                                                 allocator)){};

 LocalArgsBacking
     initialize_args_backing(TaskRegistry const &task_registry,
                             ComputationGraph const &cg,
                             RuntimeArgConfig const &runtime_arg_config,
-                            LocalTensorBacking const &local_tensor_backing) {
+                            LocalTensorBacking const &local_tensor_backing,
+                            Allocator &allocator) {
   std::unordered_map<layer_guid_t, std::optional<DeviceSpecificDeviceStates>>
       per_device_op_states;

   for (layer_guid_t const &node : topological_ordering(cg)) {
@@ -79,7 +85,8 @@
       TaskArgumentAccessor accessor = get_task_arg_accessor(
           local_tensor_backing,
           make_args_backing_with_empty_device_states(runtime_arg_config),
-          invocation);
+          invocation,
+          allocator);
       TaskSignatureAndImpl task_sig_impl =
           task_registry.task_mapping.at(invocation.task_id);
       auto fn = task_sig_impl.impl_function.get<InitTaskImplFunction>()
@@ -103,7 +110,8 @@ std::optional<float> call_task_impl(TaskRegistry const &task_registry,

 std::optional<float>
     execute_forward(LocalTrainingBacking const &local_training_backing,
-                    layer_guid_t const &operator_node) {
+                    layer_guid_t const &operator_node,
+                    Allocator &allocator) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
                                        OpTaskType::FWD)) {
@@ -130,7 +138,8 @@ std::optional<float>
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation);
+                              invocation,
+                              allocator);
     return call_task_impl(
         local_training_backing.task_registry, invocation.task_id, accessor);
   } else {
@@ -141,7 +150,8 @@ std::optional<float>
 void compute_loss(LocalTrainingBacking const &local_training_backing,
                   LossAttrs const &loss_attrs,
                   tensor_guid_t const &logit_tensor,
-                  loss_tensor_t const &label_tensor) {
+                  loss_tensor_t const &label_tensor,
+                  Allocator &allocator) {
   TaskInvocation loss_invocation = backward(
       loss_attrs,
       logit_tensor,
@@ -153,14 +163,16 @@ void compute_loss(LocalTrainingBacking const &local_training_backing,
   TaskArgumentAccessor loss_accessor =
       get_task_arg_accessor(local_training_backing.local_tensor_backing,
                             local_training_backing.local_args_backing,
-                            loss_invocation);
+                            loss_invocation,
+                            allocator);
   TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
   loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
 }

 std::optional<float>
     execute_backward(LocalTrainingBacking const &local_training_backing,
-                     layer_guid_t const &operator_node) {
+                     layer_guid_t const &operator_node,
+                     Allocator &allocator) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
                                        OpTaskType::BWD)) {
@@ -187,7 +199,8 @@ std::optional<float>
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation);
+                              invocation,
+                              allocator);
     return call_task_impl(
         local_training_backing.task_registry, invocation.task_id, accessor);
   } else {
@@ -197,7 +210,8 @@ std::optional<float>
 void execute_update(LocalTrainingBacking const &local_training_backing,
                     layer_guid_t const &node,
-                    OptimizerAttrs const &optimizer_attrs) {
+                    OptimizerAttrs const &optimizer_attrs,
+                    Allocator &allocator) {
   LayerAttrs layer_attrs =
       get_layer_attrs(local_training_backing.computation_graph, node);
   if (layer_attrs.attrs.has<WeightAttrs>()) {
@@ -225,7 +239,8 @@ void execute_update(LocalTrainingBacking const &local_training_backing,
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation);
+                              invocation,
+                              allocator);
     TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
     update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
   }
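[Note: a usage sketch added for illustration; it is not part of the patch. After this change the training loop drives LocalTrainingBacking through free functions and threads the Allocator through every call instead of reading it out of LocalTensorBacking. Assuming a computation graph `cg` and a RuntimeArgConfig `cfg` built as in the tests later in this series, a forward/backward pass now reads roughly as:]

    // Sketch under assumed setup (cg, cfg are placeholders built elsewhere);
    // AllocatedTensors{{}, {}, {}} means the caller pre-allocated nothing.
    Allocator allocator = create_local_cuda_memory_allocator();
    LocalTrainingBacking backing{
        allocator, AllocatedTensors{{}, {}, {}}, cg, cfg};
    for (layer_guid_t const &node : topological_ordering(cg)) {
      // returns std::optional<float>: elapsed time when profiling is enabled
      execute_forward(backing, node, allocator);
    }
    for (layer_guid_t const &node : reversed(topological_ordering(cg))) {
      execute_backward(backing, node, allocator);
    }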
@@ -234,13 +249,14 @@ void execute_update(LocalTrainingBacking const &local_training_backing, TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, LocalArgsBacking const &local_args_backing, - TaskInvocation const &invocation) { + TaskInvocation const &invocation, + Allocator &allocator) { TensorSlotsBacking tensor_slots_backing = construct_tensor_slots_backing(local_tensor_backing, invocation.binding); ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( invocation.binding, local_args_backing.runtime_arg_config); return TaskArgumentAccessor::create( - local_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); + allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 4a22937174..d404221d88 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -6,53 +6,52 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( + Allocator const &allocator, LocalTrainingBacking const &local_training_backing, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) - : training_backing(local_training_backing), loss_attrs(loss_attrs), - optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor){}; + : allocator(allocator), training_backing(local_training_backing), + loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs), + logit_tensor(logit_tensor), label_tensor(label_tensor){}; -PerLayerElapsedTime - forward(ModelTrainingInstance const &model_training_instance) { +PerLayerElapsedTime ModelTrainingInstance::forward() { PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->training_backing.computation_graph)) { std::optional elapsed_time = - execute_forward(model_training_instance.training_backing, node); + execute_forward(this->training_backing, node, this->allocator); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -PerLayerElapsedTime - backward(ModelTrainingInstance const &model_training_instance) { - compute_loss(model_training_instance.training_backing, - model_training_instance.loss_attrs, - model_training_instance.logit_tensor, - model_training_instance.label_tensor); +PerLayerElapsedTime ModelTrainingInstance::backward() { + compute_loss(this->training_backing, + this->loss_attrs, + this->logit_tensor, + this->label_tensor, + this->allocator); PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : reversed(topological_ordering( - model_training_instance.training_backing.computation_graph))) { + for (layer_guid_t const &node : reversed( + topological_ordering(this->training_backing.computation_graph))) { std::optional elapsed_time = - execute_backward(model_training_instance.training_backing, node); + execute_backward(this->training_backing, node, this->allocator); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -void update(ModelTrainingInstance &model_training_instance) { - for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { - 
execute_update(model_training_instance.training_backing,
-                   node,
-                   model_training_instance.optimizer_attrs);
+void ModelTrainingInstance::update() {
+  for (layer_guid_t const &node :
+       topological_ordering(this->training_backing.computation_graph)) {
+    execute_update(
+        this->training_backing, node, this->optimizer_attrs, this->allocator);
   }
-  model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter(
-      model_training_instance.optimizer_attrs);
+  this->optimizer_attrs =
+      get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
 }

 } // namespace FlexFlow
diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc
new file mode 100644
index 0000000000..ea64a46051
--- /dev/null
+++ b/lib/local-execution/src/unallocated_tensors.cc
@@ -0,0 +1,92 @@
+#include "local-execution/unallocated_tensors.h"
+#include "local-execution/allocated_tensors.h"
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+UnallocatedTensors generate_unallocated_tensors(
+    AllocatedTensors const &allocated_tensors,
+    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
+    GradientTensorSource &gradient_tensor_source) {
+
+  assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping));
+
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping;
+
+  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
+       tensor_attrs_mapping) {
+    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
+    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
+    TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid};
+    if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) {
+      tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape});
+    }
+
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !allocated_tensors.gradient_mapping.count(tensor_guid)) {
+      gradient_tensor_t gradient_tensor =
+          gradient_tensor_source.new_gradient_tensor();
+      tensor_type_shapes.insert(
+          {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape});
+      gradient_mapping.insert({tensor_guid, gradient_tensor});
+    }
+  }
+
+  return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}};
+}
+
+UnallocatedTensors generate_unallocated_tensors_with_optimizer(
+    AllocatedTensors const &allocated_tensors,
+    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
+    GradientTensorSource &gradient_tensor_source,
+    OptimizerTensorSource &optimizer_tensor_source,
+    OptimizerAttrs const &optimizer_attrs) {
+
+  UnallocatedTensors unallocated_tensors = generate_unallocated_tensors(
+      allocated_tensors, tensor_attrs_mapping, gradient_tensor_source);
+
+  if (!get_num_optimizer_tensors(optimizer_attrs)) {
+    return unallocated_tensors;
+  }
+
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes =
+      unallocated_tensors.tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping =
+      unallocated_tensors.gradient_mapping;
+  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+      optimizer_mapping;
+
+  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
+       tensor_attrs_mapping) {
+    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
+    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
+    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+      std::vector<optimizer_tensor_t> optimizer_tensors;
+
+      int num_optimizer_tensors_to_allocate =
+          get_num_optimizer_tensors(optimizer_attrs);
+      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
+        num_optimizer_tensors_to_allocate -=
+            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
+      }
+
+      for (int i = 0; i <
num_optimizer_tensors_to_allocate; ++i) { + optimizer_tensor_t optimizer_tensor = + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensors.push_back(optimizer_tensor); + tensor_type_shapes.insert( + {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); + } + + if (num_optimizer_tensors_to_allocate > 0) { + optimizer_mapping.insert({tensor_guid, optimizer_tensors}); + } + } + } + + return UnallocatedTensors{ + tensor_type_shapes, gradient_mapping, optimizer_mapping}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index fc647cff9b..4bcb37ea48 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -7,6 +7,7 @@ ff_add_test_executable( src/test_task_registry.cc src/test_utils.cc src/test_local_task_arg_accessor.cc + src/test_local_tensor_backing.cc PRIVATE_INCLUDE src/ DEPS diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc deleted file mode 100644 index e5ca58bc1f..0000000000 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ /dev/null @@ -1,309 +0,0 @@ -#include "kernels/attention_kernels.h" -#include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" -#include "local-execution/local_tensor_backing.h" - -#include "op-attrs/ops/attention.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include "utils/containers/get_only.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalTensorBacking -- Attention Op") { - // allocate input memory - Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; - - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; - - DataType dtype = DataType::FLOAT; - TensorShape input_tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, - DataType::FLOAT, - }; - TensorShape query_shape = input_tensor_shape; - TensorShape key_shape = input_tensor_shape; - TensorShape value_shape = input_tensor_shape; - GenericTensorAccessorW query = allocator.allocate_tensor(query_shape); - GenericTensorAccessorW key = allocator.allocate_tensor(key_shape); - GenericTensorAccessorW value = allocator.allocate_tensor(value_shape); - - // build graph - ComputationGraphBuilder cg_builder; - tensor_guid_t query_guid = - cg_builder.create_input(query_shape, CreateGrad::YES); - tensor_guid_t key_guid = - cg_builder.create_input(key_shape, CreateGrad::YES); - tensor_guid_t value_guid = - cg_builder.create_input(value_shape, CreateGrad::YES); - - std::string layer_name = "attn1"; - tensor_guid_t output_guid = - cg_builder.multihead_attention(query_guid, - key_guid, - value_guid, - embed_dim, - num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0f, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - /*initializer=*/std::nullopt, - /*maybe_name=*/layer_name); - - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - - LayerTensorBackingMap layer_tensor_backing_map = { - 
{LayerTensorKey{layer_guid, lower(query_guid)}, query}, - {LayerTensorKey{layer_guid, lower(key_guid)}, key}, - {LayerTensorKey{layer_guid, lower(value_guid)}, value}, - //{LayerTensorKey{layer_guid, lower(output_guid), output}} - }; - - // runtime arg config - ProfilingSettings settings = ProfilingSettings{/*warmup_iters=*/0, - /*measure_iters=*/0}; - PerDeviceFFHandle handle = get_mock_per_device_ff_handle(); - RuntimeArgConfig runtime_arg_config = - RuntimeArgConfig{DeviceSpecific::create(handle), - EnableProfiling::NO, - settings}; - - LocalTensorBacking local_tensor_backing = { - layer_tensor_backing_map, TensorBackingMap{}, runtime_arg_config}; - - SUBCASE("LocalTensorBacking::allocate_tensors_by_role") { - auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, - layer_guid_t l, - LayerTensorBackingMap m) -> std::pair { - GenericTensorAccessorW accessor = m.at(LayerTensorKey{l, lower(t)}); - return get_shape_and_datatype(accessor); - }; - - SUBCASE("Input (QKV) and gradient tensors allocation") { - - // allocate all tensors from input nodes - local_tensor_backing.allocate_tensors_by_role( - TensorRole::INPUT, - layer_guid, - cg_builder.computation_graph, - allocator); - - SUBCASE("Query grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; - CHECK(result == correct); - } - SUBCASE("Key grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; - CHECK(result == correct); - } - SUBCASE("Value grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{value_shape}, - dtype}; - CHECK(result == correct); - } - } - SUBCASE("Output and gradient tensors allocation") { - local_tensor_backing.allocate_tensors_by_role( - TensorRole::OUTPUT, - layer_guid, - cg_builder.computation_graph, - allocator); - SUBCASE("Output") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, layer_guid, local_tensor_backing.tensor_mapping); - std::pair correct = { - ArrayShape{ - get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, - dtype}; - CHECK(result == correct); - } - SUBCASE("Output grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = { - ArrayShape{ - get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, - dtype}; - CHECK(result == correct); - } - } - - SUBCASE("Tensor slots") { - local_tensor_backing.allocate_layer_tensors( - layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("Input tensor slots") { - std::vector correct_incoming_input_tensors = - transform( - get_incoming_inputs(cg_builder.computation_graph, layer_guid), - [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); - CHECK(correct_incoming_input_tensors == - local_tensor_backing.input_tensor_slots.at(layer_guid)); - } - SUBCASE("Weight tensor slots") { - std::vector correct_incoming_weight_tensors = - transform(get_incoming_weights(cg_builder.computation_graph, - layer_guid), - [&](tensor_guid_t const &tensor_guid) { - return 
lower(tensor_guid); - }); - CHECK(correct_incoming_weight_tensors == - local_tensor_backing.weight_tensor_slots.at(layer_guid)); - } - SUBCASE("Output tensor slots") { - std::vector correct_output_tensors = transform( - get_outgoing_tensors(cg_builder.computation_graph, layer_guid), - [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); - CHECK(correct_output_tensors == - local_tensor_backing.output_tensor_slots.at(layer_guid)); - } - } - } - - SUBCASE("Construct Slots Backings") { - enum Slots { - QUERY, - KEY, - VALUE, - WEIGHTS, - OUTPUT, - QUERY_PARALLEL_TENSOR_SHAPE, - QPROJSIZE, - ATTRS, - PROFILING, - HANDLE, - }; - MultiHeadAttentionAttrs attrs = - get_layer_attrs(cg_builder.computation_graph, layer_guid) - .attrs.get(); - OpTaskBinding binding = [&] { - OpTaskBinding b; - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_grad(QUERY, input_tensor(0)); - - b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); - b.bind_arg(ATTRS, attrs); - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(HANDLE, ff_handle()); - return b; - }(); - - local_tensor_backing.allocate_layer_tensors( - layer_guid, cg_builder.computation_graph, allocator); - - SUBCASE("LocalTensorBacking::construct_tensor_slots_backing") { - TensorSlotsBackingWithoutAddresses result = - get_slots_backing_without_tensor_allocation_addresses( - local_tensor_backing.construct_tensor_slots_backing( - binding, layer_guid)); - TensorSlotsBackingWithoutAddresses correct = [&] { - TensorShape weights_shape = throw_if_unexpected( - get_weights_shape(attrs, query_shape, key_shape, value_shape)); - GenericTensorAccessorW weights = - allocator.allocate_tensor(weights_shape); - - TensorAttrs output_attrs = - get_tensor_attrs(cg_builder.computation_graph, output_guid); - GenericTensorAccessorW output = - allocator.allocate_tensor(output_attrs.shape); - return get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking{ - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, - query}, - {SlotTensorTypeId{slot_id_t{KEY}, TensorType::FORWARD}, key}, - {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, - value}, - {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, - weights}, - {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, - output}, - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, - query}}); - }(); - - CHECK(result == correct); - } - SUBCASE("LocalTensorBacking::construct_arg_slots_backing") { - ArgSlotsBacking result = - local_tensor_backing.construct_arg_slots_backing(binding, - layer_guid); - - ArgSlotsBacking correct = [&] { - ParallelTensorShape query_parallel_tensor_shape = - lift_to_parallel(query_shape); - - return ArgSlotsBacking{ - {slot_id_t{QPROJSIZE}, - ConcreteArgSpec::create(get_qProjSize(attrs))}, - {slot_id_t{ATTRS}, ConcreteArgSpec::create(attrs)}, - {slot_id_t{QUERY_PARALLEL_TENSOR_SHAPE}, - ConcreteArgSpec::create(query_parallel_tensor_shape)}, - {slot_id_t{PROFILING}, - ConcreteArgSpec::create(runtime_arg_config.profiling_settings)}, - {slot_id_t{HANDLE}, ConcreteArgSpec::create(handle)}}; - }(); - - CHECK(result == correct); - } - - SUBCASE("LocalTensorBacking::resolve_runtime_arg_ref_spec") { - RuntimeArgRefSpec ref_spec = RuntimeArgRefSpec::create(ff_handle()); - ConcreteArgSpec arg_spec = - 
local_tensor_backing.resolve_runtime_arg_ref_spec(ref_spec);
-
-      PerDeviceFFHandle result_handle = arg_spec.get<PerDeviceFFHandle>();
-      CHECK(result_handle == handle);
-    }
-  }
-  }
-}
diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc
new file mode 100644
index 0000000000..083b677e18
--- /dev/null
+++ b/lib/local-execution/test/src/test_local_tensor_backing.cc
@@ -0,0 +1,152 @@
+#include "local-execution/local_cpu_allocator.h"
+#include "local-execution/local_tensor_backing.h"
+#include "test_utils.h"
+#include "utils/containers/keys.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+bool is_shape_and_dtype_equal_for_tensor_backings(
+    std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const &m1,
+    std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const &m2) {
+  if (keys(m1) == keys(m2)) {
+    for (std::pair<TensorTypeVariant, GenericTensorAccessorW> const
+             &tensor_type_backing : m1) {
+      if (is_shape_and_dtype_equal(tensor_type_backing.second,
+                                   m2.at(tensor_type_backing.first))) {
+        continue;
+      } else {
+        return false;
+      }
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("LocalTensorBacking") {
+    MockTensorGuidSource tensor_guid_source;
+    GradientTensorSource gradient_tensor_source;
+    OptimizerTensorSource optimizer_tensor_source;
+    LossTensorSource loss_tensor_source;
+
+    SUBCASE("merge_optimizer_mappings") {
+      SUBCASE("Both empty") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings({}, {});
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            correct = {};
+        CHECK(result == correct);
+      }
+
+      tensor_guid_t allocated_tensor_guid =
+          tensor_guid_source.new_mock_tensor_guid();
+      optimizer_tensor_t optimizer_tensor_1 =
+          optimizer_tensor_source.new_optimizer_tensor();
+      optimizer_tensor_t optimizer_tensor_2 =
+          optimizer_tensor_source.new_optimizer_tensor();
+      std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+          correct = {{allocated_tensor_guid,
+                      {optimizer_tensor_1, optimizer_tensor_2}}};
+      SUBCASE("Unallocated is empty") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            allocated = {{allocated_tensor_guid,
+                          {optimizer_tensor_1, optimizer_tensor_2}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings(allocated, {});
+        CHECK(result == correct);
+      }
+      SUBCASE("Allocated is empty") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            unallocated = {{allocated_tensor_guid,
+                            {optimizer_tensor_1, optimizer_tensor_2}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings({}, unallocated);
+        CHECK(result == correct);
+      }
+
+      SUBCASE("Both are partially allocated") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            allocated = {{allocated_tensor_guid, {optimizer_tensor_1}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            unallocated = {{allocated_tensor_guid, {optimizer_tensor_2}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings(allocated, unallocated);
+        CHECK(result == correct);
+      }
+    }
+
+    SUBCASE("get_tensor_backings") {
+      Allocator allocator = create_local_cpu_memory_allocator();
+      SUBCASE("Both are empty") {
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings({}, {}, allocator);
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> correct =
+            {};
+        CHECK(result == correct);
+      }
+
+      tensor_guid_t allocated_tensor_guid =
+          tensor_guid_source.new_mock_tensor_guid();
+      tensor_guid_t unallocated_tensor_guid =
+          tensor_guid_source.new_mock_tensor_guid();
+
+      TensorAttrs allocated_tensor_attrs = TensorAttrs{
+          TensorShape{TensorDims{FFOrdered<nonnegative_int>{16_n, 10_n}},
+                      DataType::FLOAT},
+          std::nullopt,
+          std::nullopt,
+          CreateGrad::NO};
+      TensorAttrs unallocated_tensor_attrs = TensorAttrs{
+          TensorShape{TensorDims{FFOrdered<nonnegative_int>{16_n, 20_n}},
+                      DataType::FLOAT},
+          std::nullopt,
+          std::nullopt,
+          CreateGrad::YES};
+
+      GenericTensorAccessorW allocated_tensor_backing =
+          allocator.allocate_tensor(allocated_tensor_attrs.shape);
+      GenericTensorAccessorW unallocated_tensor_backing =
+          allocator.allocate_tensor(unallocated_tensor_attrs.shape);
+
+      SUBCASE("Unallocated is empty") {
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+            allocated = {{TensorTypeVariant{allocated_tensor_guid},
+                          allocated_tensor_backing}};
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings(allocated, {}, allocator);
+        CHECK(result == allocated);
+      }
+      SUBCASE("Allocated is empty") {
+        std::unordered_map<TensorTypeVariant, TensorShape> unallocated = {
+            {TensorTypeVariant{unallocated_tensor_guid},
+             unallocated_tensor_attrs.shape}};
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings({}, unallocated, allocator);
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> correct =
+            {{TensorTypeVariant{unallocated_tensor_guid},
+              unallocated_tensor_backing}};
+        CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct));
+      }
+      SUBCASE("Both are partially allocated") {
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+            allocated = {{TensorTypeVariant{allocated_tensor_guid},
+                          allocated_tensor_backing}};
+        std::unordered_map<TensorTypeVariant, TensorShape> unallocated = {
+            {TensorTypeVariant{unallocated_tensor_guid},
+             unallocated_tensor_attrs.shape}};
+
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings(allocated, unallocated, allocator);
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> correct =
+            {{TensorTypeVariant{allocated_tensor_guid},
+              allocated_tensor_backing},
+             {TensorTypeVariant{unallocated_tensor_guid},
+              unallocated_tensor_backing}};
+        CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct));
+      }
+    }
+  }
+}
diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc
index 20b4f11a2a..dd4b6f5b44 100644
--- a/lib/local-execution/test/src/test_task_registry.cc
+++ b/lib/local-execution/test/src/test_task_registry.cc
@@ -143,5 +143,74 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(task_registry != other_task_registry);
     }
   }
+
+  SUBCASE("registry_contains_task_for_layer") {
+    SUBCASE("Task exists") {
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{attrs, std::nullopt}},
+      });
+      SUBCASE("Init") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::INIT);
+        CHECK(result == true);
+      }
+      SUBCASE("Fwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::FWD);
+        CHECK(result == true);
+      }
+      SUBCASE("Bwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::BWD);
+        CHECK(result == true);
+      }
+    }
+
+    SUBCASE("Partial task does not exist") {
+      ComputationGraphOpAttrs bmm_attrs = ComputationGraphOpAttrs{
+          BatchMatmulAttrs{/*a_seq_length_dim=*/10_n,
+                           /*b_seq_length_dim=*/20_n}};
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{bmm_attrs, std::nullopt}},
+      });
+      SUBCASE("Init") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::INIT);
+        CHECK(result == false);
+      }
+      SUBCASE("Fwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::FWD);
+        CHECK(result == true);
+      }
+      SUBCASE("Bwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::BWD);
+        CHECK(result == true);
+      }
+    }
+
+    SUBCASE("Empty tasks") {
+      std::unordered_map<layer_guid_t, std::optional<task_id_t>>
+          empty_task_ids = {{layer_guid, std::nullopt}};
+      TaskRegistry task_registry =
+          TaskRegistry{empty_task_ids, empty_task_ids, empty_task_ids, {}};
+      SUBCASE("Init") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::INIT);
+
CHECK(result == false); + } + SUBCASE("Fwd") { + bool result = registry_contains_task_for_layer( + task_registry, layer_guid, OpTaskType::FWD); + CHECK(result == false); + } + SUBCASE("Bwd") { + bool result = registry_contains_task_for_layer( + task_registry, layer_guid, OpTaskType::BWD); + CHECK(result == false); + } + } + } } } diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 00f4c1c27c..662e7b1878 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,9 +1,9 @@ #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_cpu_allocator.h" -#include "local-execution/local_tensor_backing.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" +#include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.dtg.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" From aef8ad58196f7b7f724fc7f0a1a65af24ee12acd Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 22 Feb 2025 07:43:31 -0800 Subject: [PATCH 48/91] Remove lowered tensor source --- .../local-execution/local_tensor_backing.h | 1 - .../local-execution/lowered_tensor_source.h | 20 ------------------- .../src/lowered_tensor_source.cc | 14 ------------- 3 files changed, 35 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/lowered_tensor_source.h delete mode 100644 lib/local-execution/src/lowered_tensor_source.cc diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 70a2474159..f6168f2fb1 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -8,7 +8,6 @@ #include "local-execution/local_task_argument_accessor.h" #include "local-execution/local_tensor_backing.dtg.h" #include "local-execution/loss_tensor_source.h" -#include "local-execution/lowered_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.dtg.h" #include "pcg/computation_graph.dtg.h" diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h deleted file mode 100644 index bd0b90dd75..0000000000 --- a/lib/local-execution/include/local-execution/lowered_tensor_source.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H - -#include "task-spec/lowered_tensor_t.dtg.h" - -namespace FlexFlow { - -struct LoweredTensorSource { -public: - LoweredTensorSource(); - - lowered_tensor_t new_lowered_tensor(); - -private: - static size_t next_available_lowered_tensor_id; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/src/lowered_tensor_source.cc b/lib/local-execution/src/lowered_tensor_source.cc deleted file mode 100644 index af80aa2335..0000000000 --- a/lib/local-execution/src/lowered_tensor_source.cc +++ /dev/null @@ -1,14 +0,0 @@ -#include "local-execution/lowered_tensor_source.h" - -namespace FlexFlow { - -size_t LoweredTensorSource::next_available_lowered_tensor_id = 0; - -LoweredTensorSource::LoweredTensorSource() {} - -lowered_tensor_t 
LoweredTensorSource::new_lowered_tensor() { - return lowered_tensor_t{ - LoweredTensorSource::next_available_lowered_tensor_id++}; -} - -} // namespace FlexFlow From f0a4285bf4262bc793f9e4e8f4aa4e2c51d048fd Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sun, 23 Feb 2025 17:25:21 -0800 Subject: [PATCH 49/91] Loss and update tests --- .../local-execution/allocated_tensors.h | 2 + lib/local-execution/src/allocated_tensors.cc | 4 + lib/local-execution/test/CMakeLists.txt | 7 +- .../test/src/test_local_cost_estimator.cc | 138 +++++++++--------- lib/local-execution/test/src/test_loss_e2e.cc | 97 ------------ .../test/src/test_loss_functions.cc | 127 ++++++++++++++++ lib/local-execution/test/src/test_update.cc | 115 +++++++++++++++ .../test/src/test_update_e2e.cc | 93 ------------ 8 files changed, 319 insertions(+), 264 deletions(-) delete mode 100644 lib/local-execution/test/src/test_loss_e2e.cc create mode 100644 lib/local-execution/test/src/test_loss_functions.cc create mode 100644 lib/local-execution/test/src/test_update.cc delete mode 100644 lib/local-execution/test/src/test_update_e2e.cc diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h index 7581a159ad..f3face6ace 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -25,6 +25,8 @@ bool is_allocated_tensor_backing_valid( std::unordered_map const &, ArrayShape const &); +AllocatedTensors make_empty_allocated_tensors(); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 3e249bf6d1..2c40cc3b86 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -138,4 +138,8 @@ bool are_allocated_tensors_valid( are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); } +AllocatedTensors make_empty_allocated_tensors() { + return AllocatedTensors{{}, {}, {}}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index 4bcb37ea48..930ab5c4e2 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -2,12 +2,7 @@ ff_add_test_executable( NAME local-execution-tests SRC_PATTERNS - src/test_allocated_tensors.cc - src/test_unallocated_tensors.cc - src/test_task_registry.cc - src/test_utils.cc - src/test_local_task_arg_accessor.cc - src/test_local_tensor_backing.cc + src/*.cc PRIVATE_INCLUDE src/ DEPS diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..7220d2a367 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -1,79 +1,81 @@ -// #include "doctest/doctest.h" -// #include "kernels/local_cuda_allocator.h" -// #include "kernels/managed_per_device_ff_handle.h" -// #include "local-execution/local_cost_estimator.h" -// #include "op-attrs/ops/attention.h" -// #include "op-attrs/parallel_tensor_shape.h" -// #include "pcg/computation_graph_builder.h" -// #include "test_utils.h" +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/local_cost_estimator.h" +#include "op-attrs/ops/attention.h" +#include 
"op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph_builder.h" +#include "test_utils.h" -// using namespace ::FlexFlow; +using namespace ::FlexFlow; -// TEST_SUITE(FF_CUDA_TEST_SUITE) { -// TEST_CASE("Local Cost Estimator") { -// // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Local Cost Estimator") { + // local backing initialization + ManagedPerDeviceFFHandle managed_handle{}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, + /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; + LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; + SUBCASE("Estimate cost -- Attention Op") { + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; + MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); + DataType dtype = DataType::FLOAT; + ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ + TensorDims{ + FFOrdered{batch_size, seq_len, feature_size}}, + DataType::FLOAT, + }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; + ParallelTensorShape weights_shape = throw_if_unexpected( + get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + ParallelTensorAttrs weight_attrs = + ParallelTensorAttrs{weights_shape, + /*sync_type=*/std::nullopt, + /*initializer=*/std::nullopt, + CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; + ParallelTensorShape output_shape = throw_if_unexpected( + get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + ParallelTensorAttrs output_attrs = + ParallelTensorAttrs{output_shape, + /*sync_type=*/std::nullopt, + /*initializer=*/std::nullopt, + CreateGrad::YES}; -// CostDetails 
result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); + CostDetails result = cost_estimator.estimate_cost( + PCGOperatorAttrs{attrs}, + std::vector{ + inputs_shape, inputs_shape, inputs_shape}, + std::vector{weight_attrs}, + std::vector{output_attrs}, + make_1d_machine_view( + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, + MachineSpecificationDimension::INTRA_NODE, + stride_t{0_n})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); -// } -// } -// } + CHECK(result.total_elapsed_time > 0); + CHECK(result.total_mem_usage > 0); + } + } +} diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc deleted file mode 100644 index 62778c2e32..0000000000 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ /dev/null @@ -1,97 +0,0 @@ -#include "doctest/doctest.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/local_training_backing.h" - -#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" - -namespace FlexFlow { - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Execution E2E") { - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // construct graph - ComputationGraphBuilder cg_builder; - - size_t batch_size = 10; - size_t data_dim = 100; - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - tensor_guid_t input_tensor = - cg_builder.create_input(input_shape, CreateGrad::YES); - - float scalar = 4.0; - std::string layer_name = "scalar multiply"; - tensor_guid_t logit_tensor = - cg_builder.scalar_multiply(input_tensor, scalar, layer_name); - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - - // allocate memory - Allocator allocator = create_local_cuda_memory_allocator(); - - LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - LayerTensorBackingMap{}, - TensorBackingMap{}, - runtime_arg_config); - - local_backing.register_and_allocate_layer(layer_guid); - - SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { - TensorShape label_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; - lowered_tensor_t label_tensor = lowered_tensor_t{-1}; - GenericTensorAccessorW label_backing = - allocator.allocate_tensor(label_shape); - local_backing.local_tensor_backing.non_graph_tensor_mapping.insert( - {label_tensor, label_backing}); - LossAttrs loss_attrs = LossAttrs{ - SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; - local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); - } - - SUBCASE("NonconfigurableLossAttrs") { - lowered_tensor_t label_tensor = lowered_tensor_t{-1}; - GenericTensorAccessorW label_backing = - allocator.allocate_tensor(input_shape); - local_backing.local_tensor_backing.non_graph_tensor_mapping.insert( - {label_tensor, label_backing}); - - 
SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { - LossAttrs loss_attrs = LossAttrs{ - NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - local_backing.compute_loss( - loss_attrs, lower(logit_tensor), label_tensor); - } - - SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ - LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; - local_backing.compute_loss( - loss_attrs, lower(logit_tensor), label_tensor); - } - - SUBCASE("LossFunction::IDENTITY") { - LossAttrs loss_attrs = - LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - local_backing.compute_loss( - loss_attrs, lower(logit_tensor), label_tensor); - } - } - } -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc new file mode 100644 index 0000000000..c0386a4171 --- /dev/null +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -0,0 +1,127 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Loss Functions") { + Allocator allocator = create_local_cuda_memory_allocator(); + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_for_nonconfigurable_loss_attrs = + loss_tensor_source.new_loss_tensor(); + loss_tensor_t label_for_sparse_cce_loss_attrs = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 100_n; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + TensorShape reduced_input_tensor_shape = + TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = + allocator.allocate_tensor(reduced_input_tensor_shape); + GenericTensorAccessorW label_for_sparse_cce_loss_attrs_backing = + allocator.allocate_tensor(reduced_input_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{label_for_nonconfigurable_loss_attrs}, + label_for_nonconfigurable_loss_attrs_backing}, + {TensorTypeVariant{label_for_sparse_cce_loss_attrs}, + label_for_sparse_cce_loss_attrs_backing}}, + {}, + {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorAttrs input_tensor_attrs = TensorAttrs{ + input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; + + LayerAddedResult inputs_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, + {}, + {input_tensor_attrs}); + + float scalar = 4.0; + LayerAddedResult scalar_multiply_operator = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ + OperatorType::SCALAR_MULTIPLY, scalar}}, + "scalar_mult"}, + inputs_layer.outputs, + {input_tensor_attrs}); + tensor_guid_t label_tensor = scalar_multiply_operator.outputs.at(0); + + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig 
runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LocalTrainingBacking local_training_backing = LocalTrainingBacking{ + allocator, allocated_tensors, computation_graph, runtime_arg_config}; + + SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { + LossAttrs loss_attrs = LossAttrs{ + SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; + + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_sparse_cce_loss_attrs, + allocator); + } + + SUBCASE("NonconfigurableLossAttrs") { + SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_nonconfigurable_loss_attrs, + allocator); + } + + SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_nonconfigurable_loss_attrs, + allocator); + } + + SUBCASE("LossFunction::IDENTITY") { + LossAttrs loss_attrs = + LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_nonconfigurable_loss_attrs, + allocator); + } + } + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc new file mode 100644 index 0000000000..3121d8e02b --- /dev/null +++ b/lib/local-execution/test/src/test_update.cc @@ -0,0 +1,115 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Execute Update") { + Allocator allocator = create_local_cuda_memory_allocator(); + AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 100_n; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorAttrs input_tensor_attrs = TensorAttrs{ + input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; + + LayerAddedResult inputs_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, + {}, + {input_tensor_attrs}); + + float scalar = 4.0; + LayerAddedResult scalar_multiply_operator = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ + OperatorType::SCALAR_MULTIPLY, scalar}}, + "scalar_mult"}, + inputs_layer.outputs, + {input_tensor_attrs}); + + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; 
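+
+    // For reference, the SGD subcases below exercise an update of roughly the
+    // following common form (a sketch only; the kernel's exact handling of
+    // weight decay and nesterov may differ):
+    //   g' = g + weight_decay * w
+    //   v  = momentum * v + lr * g'   (momentum buffer, when momentum != 0)
+    //   w  = w - v                    (plain w = w - lr * g' when momentum == 0)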
+ + SUBCASE("SGDOptimizerAttrs") { + SUBCASE("momentum=0") { + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + execute_update(local_training_backing, + scalar_multiply_operator.layer, + optimizer_attrs, + allocator); + } + SUBCASE("momentum=0.9") { + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + execute_update(local_training_backing, + scalar_multiply_operator.layer, + optimizer_attrs, + allocator); + } + } + SUBCASE("AdamOptimizerAttrs") { + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + execute_update(local_training_backing, + scalar_multiply_operator.layer, + optimizer_attrs, + allocator); + } + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc deleted file mode 100644 index 4658a2a544..0000000000 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ /dev/null @@ -1,93 +0,0 @@ -#include "doctest/doctest.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/local_training_backing.h" - -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" - -namespace FlexFlow { - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Execution Update E2E") { - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // construct graph - ComputationGraphBuilder cg_builder; - - size_t batch_size = 10; - size_t data_dim = 100; - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - tensor_guid_t input_tensor = - cg_builder.create_input(input_shape, CreateGrad::YES); - - float scalar = 4.0; - std::string layer_name = "scalar_multiply"; - tensor_guid_t logit_tensor = - cg_builder.scalar_multiply(input_tensor, scalar, layer_name); - - // allocate memory - Allocator allocator = create_local_cuda_memory_allocator(); - LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - LayerTensorBackingMap{}, - TensorBackingMap{}, - runtime_arg_config); - // for (layer_guid_t const & node: - // topological_ordering(cg_builder.computation_graph)) { - // local_backing.register_and_allocate_layer(node); - // } - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - local_backing.register_and_allocate_layer(layer_guid); - - 
SUBCASE("SGDOptimizerAttrs") { - SUBCASE("momentum=0") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.0f, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - local_backing.allocate_layer_optimizer_tensors(layer_guid, - optimizer_attrs); - local_backing.execute_update(layer_guid, optimizer_attrs); - } - SUBCASE("momentum=0.9") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - local_backing.allocate_layer_optimizer_tensors(layer_guid, - optimizer_attrs); - local_backing.execute_update(layer_guid, optimizer_attrs); - } - } - SUBCASE("AdamOptimizerAttrs") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, - /*beta1=*/0.9, - /*beta2=*/0.999, - /*weight_decay=*/0.001, - /*alpha_t=*/0.001, - /*beta_t=*/0.9, - /*beta2_t=*/0.999, - /*epsilon=*/1e-8}}; - local_backing.allocate_layer_optimizer_tensors(layer_guid, - optimizer_attrs); - local_backing.execute_update(layer_guid, optimizer_attrs); - } - } -} - -} // namespace FlexFlow From 350babf3584c3d99e76e4dc0f72a658aa0222afc Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sun, 23 Feb 2025 19:11:42 -0800 Subject: [PATCH 50/91] Passing tests after merge issues --- lib/local-execution/src/allocated_tensors.cc | 5 +- .../src/local_cost_estimator.cc | 59 +++-- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/optimizer.cc | 224 +++++++++++------- lib/local-execution/src/task_registry.cc | 2 +- .../src/unallocated_tensors.cc | 4 +- .../test/src/test_allocated_tensors.cc | 6 - .../test/src/test_local_cost_estimator.cc | 10 +- .../test/src/test_local_tensor_backing.cc | 4 - .../test/src/test_loss_functions.cc | 74 +++--- .../test/src/test_unallocated_tensors.cc | 6 - lib/local-execution/test/src/test_update.cc | 51 ++-- 12 files changed, 247 insertions(+), 200 deletions(-) diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 2c40cc3b86..196da16ace 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -54,8 +54,7 @@ bool are_allocated_gradient_tensors_valid( for (std::pair const &tensor_to_grad : allocated_tensors.gradient_mapping) { if (tensor_attrs.count(tensor_to_grad.first)) { - if (tensor_attrs.at(tensor_to_grad.first).create_gradients == - CreateGrad::NO) { + if (tensor_attrs.at(tensor_to_grad.first).create_grad == CreateGrad::NO) { return false; } @@ -96,7 +95,7 @@ bool are_allocated_optimizer_tensors_valid( for (std::pair> const &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { if (tensor_attrs.count(tensor_to_optimizers.first)) { - if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == + if (tensor_attrs.at(tensor_to_optimizers.first).create_grad == CreateGrad::NO) { return false; } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 5c17f011e4..9828a67293 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -4,13 +4,13 @@ #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" -#include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/computation_graph.h" +#include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/machine_view.dtg.h" #include 
"pcg/parallel_tensor_attrs.h" #include "utils/containers/concat_vectors.h" +#include "utils/containers/get_only.h" #include "utils/containers/sum.h" -#include "pcg/parallel_tensor_attrs.h" #include "utils/containers/transform.h" #include "utils/containers/values.h" @@ -26,41 +26,36 @@ static ComputationGraph create_computation_graph_for_local_cost_estimation( std::vector const &outputs) { ComputationGraph computation_graph = make_empty_computation_graph(); - // create layer for inputs - auto get_vector_piece_attrs_from_parallel_tensor_shape = - [](std::vector const ¶llel_shapes) { - return transform(parallel_shapes, [](ParallelTensorShape const &p) { - return TensorAttrs{ - get_piece_shape(p), std::nullopt, std::nullopt, CreateGrad::YES}; - }); - }; - - LayerAddedResult inputs_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, - {}, - get_vector_piece_attrs_from_parallel_tensor_shape(inputs)); - - // create layer for weights - auto get_vector_piece_attrs_from_parallel_tensor_attrs = - [](std::vector const ¶llel_attrs) { - return transform(parallel_attrs, [](ParallelTensorAttrs const &p) { - return get_piece_attrs(p); - }); - }; - - LayerAddedResult weights_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "weights"}, - {}, - get_vector_piece_attrs_from_parallel_tensor_attrs(weights)); + std::vector input_tensors; + for (ParallelTensorShape const &input : inputs) { + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}}, + std::nullopt}, + {}, + {}); + input_tensors.push_back(get_only(inputs_layer.outputs)); + } + + std::vector weight_tensors; + for (ParallelTensorAttrs const &weight : weights) { + LayerAddedResult weights_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + get_piece_shape(weight.shape), + InitializerAttrs{ZeroInitializerAttrs{}}}}, + std::nullopt}, + {}, + {}); + weight_tensors.push_back(get_only(weights_layer.outputs)); + } // create operator layer LayerAddedResult operator_layer = add_layer( computation_graph, LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, - concat_vectors(inputs_layer.outputs, weights_layer.outputs), - get_vector_piece_attrs_from_parallel_tensor_attrs(outputs)); + input_tensors, + weight_tensors); return computation_graph; } diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index df15c707b2..77e62e52af 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -213,7 +213,7 @@ void execute_update(LocalTrainingBacking const &local_training_backing, Allocator &allocator) { LayerAttrs layer_attrs = get_layer_attrs(local_training_backing.computation_graph, node); - if (layer_attrs.attrs.has()) { + if (layer_attrs.op_attrs.has()) { // get tensors tensor_guid_t weight_tensor = get_only( get_outgoing_tensors(local_training_backing.computation_graph, node)); diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index a69ae9da61..1b9ce83d14 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -1,6 +1,7 @@ #include "local-execution/optimizer.h" #include "kernels/optimizer_kernels.h" #include "task-spec/profiling.h" +#include "utils/containers/get_only.h" #include "utils/overload.h" namespace FlexFlow { @@ -24,9 +25,12 @@ 
TaskSignature get_sgd_update_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - add_unchecked_arg_slot(sig, HANDLE); - } + add_unchecked_arg_slot( + sig, HANDLE); // how to deal with removal of ParamSync? + + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // add_unchecked_arg_slot(sig, HANDLE); + // } return sig; } @@ -44,12 +48,16 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - b.bind_arg(HANDLE, ff_handle()); - return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b}; - } else { - return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b}; - } + b.bind_arg(HANDLE, ff_handle()); + return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, + b}; // how to deal with removal of ParamSync? + + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // b.bind_arg(HANDLE, ff_handle()); + // return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b}; + // } else { + // return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b}; + // } } static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { @@ -73,35 +81,49 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { sgd_v_ptr = sgd_v.get_float_ptr(); } - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - auto handle = acc.get_argument(HANDLE); - profile(sgd_nccl_update_task_gpu, - profiling, - "[SGD NCCL] update_time = %.2lfms\n", - attrs.lr, - attrs.momentum, - attrs.nesterov, - attrs.weight_decay, - handle, - weight_grad.get_float_ptr(), - size, - weight.get_float_ptr(), - sgd_v_ptr); - - } else { - profile(sgd_ps_update_task_gpu, - profiling, - "[SGD PS] update_time = %.2lfms\n", - attrs.lr, - attrs.momentum, - attrs.nesterov, - attrs.weight_decay, - weight_grad.get_float_ptr(), - size, - num_replicas, - weight.get_float_ptr(), - sgd_v_ptr); - } + auto handle = acc.get_argument(HANDLE); + profile(sgd_nccl_update_task_gpu, + profiling, + "[SGD NCCL] update_time = %.2lfms\n", + attrs.lr, + attrs.momentum, + attrs.nesterov, + attrs.weight_decay, + handle, + weight_grad.get_float_ptr(), + size, + weight.get_float_ptr(), + sgd_v_ptr); // how to deal with removal of ParamSync? + + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // auto handle = acc.get_argument(HANDLE); + // profile(sgd_nccl_update_task_gpu, + // profiling, + // "[SGD NCCL] update_time = %.2lfms\n", + // attrs.lr, + // attrs.momentum, + // attrs.nesterov, + // attrs.weight_decay, + // handle, + // weight_grad.get_float_ptr(), + // size, + // weight.get_float_ptr(), + // sgd_v_ptr); + + // } else { + // profile(sgd_ps_update_task_gpu, + // profiling, + // "[SGD PS] update_time = %.2lfms\n", + // attrs.lr, + // attrs.momentum, + // attrs.nesterov, + // attrs.weight_decay, + // weight_grad.get_float_ptr(), + // size, + // num_replicas, + // weight.get_float_ptr(), + // sgd_v_ptr); + // } } TaskImplFunction get_sgd_update_task_impl() { @@ -117,9 +139,11 @@ TaskSignature get_adam_update_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - add_unchecked_arg_slot(sig, HANDLE); - } + add_unchecked_arg_slot( + sig, HANDLE); // how to deal with removal of ParamSync? 
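+
+  // Note: with ParamSync removed, HANDLE is always bound and the NCCL task id
+  // is used unconditionally; the old PS/NCCL dispatch is kept in the comment
+  // below until the open question above is settled.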
+ // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // add_unchecked_arg_slot(sig, HANDLE); + // } return sig; } @@ -135,13 +159,16 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, b.bind_optimizer(ADAM_V, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(HANDLE, ff_handle()); + return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, + b}; // how to deal with removal of ParamSync? - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - b.bind_arg(HANDLE, ff_handle()); - return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b}; - } else { - return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b}; - } + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // b.bind_arg(HANDLE, ff_handle()); + // return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b}; + // } else { + // return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b}; + // } } static void adam_update_task_impl(TaskArgumentAccessor const &acc) { @@ -162,38 +189,54 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - auto handle = acc.get_argument(HANDLE); - profile(adam_nccl_update_task_gpu, - profiling, - "[Adam NCCL] update_time = %.2lfms\n", - attrs.alpha_t, - attrs.beta1, - attrs.beta2, - attrs.weight_decay, - attrs.epsilon, - size, - handle, - weight_grad.get_float_ptr(), - m_tensor.get_float_ptr(), - v_tensor.get_float_ptr(), - weight.get_float_ptr()); - } else { - profile(adam_ps_update_task_gpu, - profiling, - "[Adam NCCL] update_time = %.2lfms\n", - attrs.alpha_t, - attrs.beta1, - attrs.beta2, - attrs.weight_decay, - attrs.epsilon, - size, - num_replicas, - weight_grad.get_float_ptr(), - m_tensor.get_float_ptr(), - v_tensor.get_float_ptr(), - weight.get_float_ptr()); - } + auto handle = acc.get_argument(HANDLE); + profile(adam_nccl_update_task_gpu, + profiling, + "[Adam NCCL] update_time = %.2lfms\n", + attrs.alpha_t, + attrs.beta1, + attrs.beta2, + attrs.weight_decay, + attrs.epsilon, + size, + handle, + weight_grad.get_float_ptr(), + m_tensor.get_float_ptr(), + v_tensor.get_float_ptr(), + weight.get_float_ptr()); // how to deal with removal of ParamSync? 
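+
+  // For reference, adam_update_task_impl further below applies roughly the
+  // standard Adam step (a sketch; bias correction is assumed to be folded
+  // into alpha_t / beta_t / beta2_t by the caller):
+  //   g' = g + weight_decay * w
+  //   m  = beta1 * m + (1 - beta1) * g'
+  //   v  = beta2 * v + (1 - beta2) * g'^2
+  //   w  = w - alpha_t * m / (sqrt(v) + epsilon)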
+ + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // auto handle = acc.get_argument(HANDLE); + // profile(adam_nccl_update_task_gpu, + // profiling, + // "[Adam NCCL] update_time = %.2lfms\n", + // attrs.alpha_t, + // attrs.beta1, + // attrs.beta2, + // attrs.weight_decay, + // attrs.epsilon, + // size, + // handle, + // weight_grad.get_float_ptr(), + // m_tensor.get_float_ptr(), + // v_tensor.get_float_ptr(), + // weight.get_float_ptr()); + // } else { + // profile(adam_ps_update_task_gpu, + // profiling, + // "[Adam NCCL] update_time = %.2lfms\n", + // attrs.alpha_t, + // attrs.beta1, + // attrs.beta2, + // attrs.weight_decay, + // attrs.epsilon, + // size, + // num_replicas, + // weight_grad.get_float_ptr(), + // m_tensor.get_float_ptr(), + // v_tensor.get_float_ptr(), + // weight.get_float_ptr()); + // } } TaskImplFunction get_adam_update_task_impl() { @@ -211,17 +254,18 @@ TaskInvocation get_update_invocation( tensor_guid_t const &weight, gradient_tensor_t const &weight_grad, std::vector const &grad_buffer_tensors) { - return attrs.visit(overload{ - [&](SGDOptimizerAttrs const &s) { - return sgd_update(s, weight, weight_grad, grad_buffer_tensors.at(0)); - }, - [&](AdamOptimizerAttrs const &s) { - return adam_update(s, - weight, - weight_grad, - grad_buffer_tensors.at(0), - grad_buffer_tensors.at(1)); - }}); + return attrs.visit( + overload{[&](SGDOptimizerAttrs const &s) { + return sgd_update( + s, weight, weight_grad, get_only(grad_buffer_tensors)); + }, + [&](AdamOptimizerAttrs const &s) { + return adam_update(s, + weight, + weight_grad, + grad_buffer_tensors.at(0), + grad_buffer_tensors.at(1)); + }}); } TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) { diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 487bd4420e..3d9dec1e26 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -19,7 +19,7 @@ TaskRegistry construct_task_registry( fwd_task_ids.insert({node, std::nullopt}); bwd_task_ids.insert({node, std::nullopt}); - ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; std::vector task_ids = get_task_ids(attrs); for (task_id_t const &task_id : task_ids) { diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc index ea64a46051..363d1eedef 100644 --- a/lib/local-execution/src/unallocated_tensors.cc +++ b/lib/local-execution/src/unallocated_tensors.cc @@ -23,7 +23,7 @@ UnallocatedTensors generate_unallocated_tensors( tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); } - if (tensor_attrs.create_gradients == CreateGrad::YES && + if (tensor_attrs.create_grad == CreateGrad::YES && !allocated_tensors.gradient_mapping.count(tensor_guid)) { gradient_tensor_t gradient_tensor = gradient_tensor_source.new_gradient_tensor(); @@ -61,7 +61,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_gradients == CreateGrad::YES) { + if (tensor_attrs.create_grad == CreateGrad::YES) { std::vector optimizer_tensors; int num_optimizer_tensors_to_allocate = diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 99abd538d5..45fc8e0a1c 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ 
b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -31,20 +31,14 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 7220d2a367..30682c9a48 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -50,18 +50,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ParallelTensorShape weights_shape = throw_if_unexpected( get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); ParallelTensorAttrs weight_attrs = - ParallelTensorAttrs{weights_shape, - /*sync_type=*/std::nullopt, - /*initializer=*/std::nullopt, - CreateGrad::YES}; + ParallelTensorAttrs{weights_shape, CreateGrad::YES}; ParallelTensorShape output_shape = throw_if_unexpected( get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); ParallelTensorAttrs output_attrs = - ParallelTensorAttrs{output_shape, - /*sync_type=*/std::nullopt, - /*initializer=*/std::nullopt, - CreateGrad::YES}; + ParallelTensorAttrs{output_shape, CreateGrad::YES}; CostDetails result = cost_estimator.estimate_cost( PCGOperatorAttrs{attrs}, diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index 083b677e18..594051c2f1 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -96,14 +96,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs allocated_tensor_attrs = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::YES}; GenericTensorAccessorW allocated_tensor_backing = diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index c0386a4171..bb3e83cc4d 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -9,6 +9,7 @@ #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" +#include "utils/containers/get_only.h" namespace FlexFlow { @@ -24,19 +25,20 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { loss_tensor_source.new_loss_tensor(); nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 100_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - TensorShape reduced_input_tensor_shape = + TensorShape reduced_tensor_shape = TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, DataType::FLOAT}; GenericTensorAccessorW 
label_for_nonconfigurable_loss_attrs_backing = - allocator.allocate_tensor(reduced_input_tensor_shape); + allocator.allocate_tensor(output_tensor_shape); GenericTensorAccessorW label_for_sparse_cce_loss_attrs_backing = - allocator.allocate_tensor(reduced_input_tensor_shape); + allocator.allocate_tensor(reduced_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ {{TensorTypeVariant{label_for_nonconfigurable_loss_attrs}, label_for_nonconfigurable_loss_attrs_backing}, @@ -48,24 +50,40 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorAttrs input_tensor_attrs = TensorAttrs{ - input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; - - LayerAddedResult inputs_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, - {}, - {input_tensor_attrs}); - - float scalar = 4.0; - LayerAddedResult scalar_multiply_operator = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ - OperatorType::SCALAR_MULTIPLY, scalar}}, - "scalar_mult"}, - inputs_layer.outputs, - {input_tensor_attrs}); - tensor_guid_t label_tensor = scalar_multiply_operator.outputs.at(0); + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, + "inputs"}, + {}, + {}); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, + "weights"}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/true, + DataType::FLOAT, + std::nullopt, + std::nullopt}}, + "linear"}, + inputs_layer.outputs, + {}); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); // initialize runtime configs ManagedPerDeviceFFHandle managed_handle{}; @@ -85,7 +103,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_sparse_cce_loss_attrs, allocator); } @@ -96,7 +114,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_nonconfigurable_loss_attrs, allocator); } @@ -106,7 +124,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_nonconfigurable_loss_attrs, allocator); } @@ -116,7 +134,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_nonconfigurable_loss_attrs, allocator); } diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 662e7b1878..82f5a132fe 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -40,20 +40,14 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ 
TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 3121d8e02b..d6108635af 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -20,29 +20,42 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 100_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; TensorShape input_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorAttrs input_tensor_attrs = TensorAttrs{ - input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, + "inputs"}, + {}, + {}); - LayerAddedResult inputs_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, - {}, - {input_tensor_attrs}); + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, + "weights"}, + {}, + {}); - float scalar = 4.0; - LayerAddedResult scalar_multiply_operator = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ - OperatorType::SCALAR_MULTIPLY, scalar}}, - "scalar_mult"}, - inputs_layer.outputs, - {input_tensor_attrs}); + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/true, + DataType::FLOAT, + std::nullopt, + std::nullopt}}, + "linear"}, + inputs_layer.outputs, + {}); // initialize runtime configs ManagedPerDeviceFFHandle managed_handle{}; @@ -66,7 +79,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { runtime_arg_config, optimizer_attrs}; execute_update(local_training_backing, - scalar_multiply_operator.layer, + linear_operator.layer, optimizer_attrs, allocator); } @@ -83,7 +96,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { runtime_arg_config, optimizer_attrs}; execute_update(local_training_backing, - scalar_multiply_operator.layer, + linear_operator.layer, optimizer_attrs, allocator); } @@ -105,7 +118,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { runtime_arg_config, optimizer_attrs}; execute_update(local_training_backing, - scalar_multiply_operator.layer, + linear_operator.layer, optimizer_attrs, allocator); } From aef7c6e3c3087f15b4c90792148f170da84f6f7c Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 25 Feb 2025 06:32:27 -0800 Subject: [PATCH 51/91] Pass gpu tests --- lib/kernels/src/array_shape.cc | 6 +-- lib/kernels/src/legion_dim.cc | 6 --- .../local-execution/local_training_backing.h | 11 ++--- .../src/local_cost_estimator.cc | 3 ++ .../src/local_training_backing.cc | 28 ++++++----- lib/local-execution/src/task_registry.cc | 1 + 
lib/local-execution/test/CMakeLists.txt | 7 +++ .../test/modify_test_commands.cmake | 21 ++++++++ .../test/src/test_loss_functions.cc | 40 +++++++-------- lib/local-execution/test/src/test_update.cc | 36 ++++++++------ .../include/task-spec/task_arg_spec.h | 12 +++++ lib/task-spec/src/op_task_invocation.cc | 49 +++++++++---------- lib/task-spec/src/task_arg_spec.cc | 11 +++++ lib/task-spec/src/task_invocation.cc | 33 ++++++++++++- 14 files changed, 173 insertions(+), 91 deletions(-) create mode 100644 lib/local-execution/test/modify_test_commands.cmake create mode 100644 lib/task-spec/include/task-spec/task_arg_spec.h create mode 100644 lib/task-spec/src/task_arg_spec.cc diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 521b15e435..f4011af79f 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -64,16 +64,16 @@ ArrayShape ArrayShape::sub_shape(std::optional start, ArrayShape ArrayShape::sub_shape(std::optional start, std::optional end) const { - std::optional legion_start = + std::optional ff_end = transform(start, [&](auto const &start_unwrapped) { return ff_dim_from_legion_dim(start_unwrapped, num_dims()); }); - std::optional legion_end = + std::optional ff_start = transform(end, [&](auto const &end_unwrapped) { return ff_dim_from_legion_dim(end_unwrapped, num_dims()); }); - return this->sub_shape(legion_start, legion_end); + return this->sub_shape(ff_start, ff_end); } bool ArrayShape::operator==(ArrayShape const &other) const { diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 49b028f227..f373cf0410 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -13,12 +13,6 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, ff_dim.value.unwrap_nonnegative() - 1}}; } -ff_dim_t legion_dim_from_ff_dim(legion_dim_t legion_dim, - nonnegative_int num_dimensions) { - return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - - legion_dim.value.unwrap_nonnegative() - 1}}; -} - ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim, nonnegative_int num_dimensions) { return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 8c2bb34130..addac74633 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -15,24 +15,23 @@ namespace FlexFlow { struct LocalTrainingBacking { LocalTrainingBacking(Allocator &, AllocatedTensors const &, + GradientTensorSource &, ComputationGraph const &, RuntimeArgConfig const &); LocalTrainingBacking(Allocator &, AllocatedTensors const &, + GradientTensorSource &, + OptimizerTensorSource &, ComputationGraph const &, RuntimeArgConfig const &, OptimizerAttrs const &); public: - LocalTensorBacking local_tensor_backing; - LocalArgsBacking local_args_backing; - ComputationGraph computation_graph; TaskRegistry task_registry; - - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; + LocalTensorBacking local_tensor_backing; + LocalArgsBacking local_args_backing; }; LocalArgsBacking initialize_args_backing(TaskRegistry const &, diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 9828a67293..532fcc91c2 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ 
b/lib/local-execution/src/local_cost_estimator.cc @@ -82,8 +82,11 @@ CostDetails LocalCostEstimator::estimate_cost( std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); + GradientTensorSource gradient_tensor_source; + LocalTrainingBacking local_backing(allocator, AllocatedTensors{{}, {}, {}}, + gradient_tensor_source, computation_graph, this->runtime_arg_config); // execute layer diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 77e62e52af..b2e0a2fb7e 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -18,20 +18,20 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator &allocator, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) : computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + task_registry( + construct_task_registry(get_layer_attrs_mapping(computation_graph))), local_tensor_backing(construct_local_tensor_backing( allocated_tensors, - generate_unallocated_tensors( - allocated_tensors, - get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), + generate_unallocated_tensors(allocated_tensors, + get_all_tensor_attrs(computation_graph), + gradient_tensor_source), allocator)), local_args_backing(initialize_args_backing(this->task_registry, - this->computation_graph, + computation_graph, runtime_arg_config, this->local_tensor_backing, allocator)){}; @@ -39,23 +39,25 @@ LocalTrainingBacking::LocalTrainingBacking( LocalTrainingBacking::LocalTrainingBacking( Allocator &allocator, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config, OptimizerAttrs const &optimizer_attrs) : computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + task_registry( + construct_task_registry(get_layer_attrs_mapping(computation_graph))), local_tensor_backing(construct_local_tensor_backing( allocated_tensors, generate_unallocated_tensors_with_optimizer( allocated_tensors, - get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, - this->optimizer_tensor_source, + get_all_tensor_attrs(computation_graph), + gradient_tensor_source, + optimizer_tensor_source, optimizer_attrs), allocator)), local_args_backing(initialize_args_backing(this->task_registry, - this->computation_graph, + computation_graph, runtime_arg_config, this->local_tensor_backing, allocator)){}; diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 3d9dec1e26..2787342a5f 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -71,6 +71,7 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, fmt::format("Invalid OpTaskType, got {}", op_task_type)); } + assert(task_ids.count(op)); return task_ids.at(op).has_value(); } diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index 930ab5c4e2..a973c6967b 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ 
b/lib/local-execution/test/CMakeLists.txt @@ -12,3 +12,10 @@ ff_add_test_executable( kernels op-attrs ) + +set(FF_TEST_EXEC_NAME "local-execution-tests") +add_custom_command( + TARGET ${FF_TEST_EXEC_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake + DEPENDS ${FF_TEST_EXEC_NAME} +) diff --git a/lib/local-execution/test/modify_test_commands.cmake b/lib/local-execution/test/modify_test_commands.cmake new file mode 100644 index 0000000000..6494ae2d78 --- /dev/null +++ b/lib/local-execution/test/modify_test_commands.cmake @@ -0,0 +1,21 @@ +# modify_test_commands.cmake + +file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") + +foreach(ctest_tests_file IN LISTS ctest_tests_files) + file(READ "${ctest_tests_file}" content) + + # add nix run prefix + string(REGEX REPLACE + "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" + "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" + content "${content}") + + # add environment + # string(REGEX REPLACE + # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" + # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" + # content "${content}") + + file(WRITE "${ctest_tests_file}" "${content}") +endforeach() diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index bb3e83cc4d..2bf138e204 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -11,10 +11,14 @@ #include "test_utils.h" #include "utils/containers/get_only.h" -namespace FlexFlow { +using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { +TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Loss Functions") { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + Allocator allocator = create_local_cuda_memory_allocator(); // allocate label tensors @@ -58,44 +62,42 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; - LayerAddedResult inputs_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, - "inputs"}, - {}, - {}); + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); LayerAddedResult weights_layer = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, - "weights"}, + std::nullopt}, {}, {}); LayerAddedResult linear_operator = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/true, + /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, - "linear"}, + std::nullopt}, inputs_layer.outputs, - {}); + weights_layer.outputs); tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), EnableProfiling::YES, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; // initialize training backing - LocalTrainingBacking local_training_backing = LocalTrainingBacking{ - allocator, allocated_tensors, computation_graph, runtime_arg_config}; + GradientTensorSource 
gradient_tensor_source; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + gradient_tensor_source, + computation_graph, + runtime_arg_config}; SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { LossAttrs loss_attrs = LossAttrs{ @@ -141,5 +143,3 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } } - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index d6108635af..1f8684f38a 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -9,10 +9,14 @@ #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" -namespace FlexFlow { +using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { +TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Execute Update") { + // initialize runtime configs + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); @@ -31,12 +35,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; - LayerAddedResult inputs_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, - "inputs"}, - {}, - {}); + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); LayerAddedResult weights_layer = add_layer( computation_graph, @@ -49,22 +49,22 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAddedResult linear_operator = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/true, + /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, "linear"}, inputs_layer.outputs, - {}); - - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; + weights_layer.outputs); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), EnableProfiling::YES, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + SUBCASE("SGDOptimizerAttrs") { SUBCASE("momentum=0") { OptimizerAttrs optimizer_attrs = @@ -75,6 +75,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs}; @@ -92,6 +94,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs}; @@ -114,6 +118,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs}; @@ -124,5 +130,3 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } } - -} // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/task_arg_spec.h b/lib/task-spec/include/task-spec/task_arg_spec.h new file mode 100644 index 0000000000..38879ecab9 --- /dev/null +++ b/lib/task-spec/include/task-spec/task_arg_spec.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARG_SPEC_H +#define 
_FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARG_SPEC_H + +#include "task-spec/task_arg_spec.dtg.h" + +namespace FlexFlow { + +std::type_index get_type_index(TaskArgSpec const &); + +} + +#endif diff --git a/lib/task-spec/src/op_task_invocation.cc b/lib/task-spec/src/op_task_invocation.cc index d495dd9f92..a55995920a 100644 --- a/lib/task-spec/src/op_task_invocation.cc +++ b/lib/task-spec/src/op_task_invocation.cc @@ -79,43 +79,40 @@ OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { return bwd; } -bool is_op_tensor_spec_invalid(OpTensorSlotSpec const &tensor_slot_spec, - OpTensorSpec const &tensor_spec) { - return tensor_spec.role != tensor_slot_spec.tensor_role || - tensor_spec.slot_option != tensor_slot_spec.slot_option; -} - bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { - auto tensor_bindings = inv.binding.get_tensor_bindings(); - for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotGradId tensor_key = - SlotGradId{op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; - OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); - if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { + // TODO: fix for variadic inputs (need to implement .bind() for variadic + // first) + for (std::pair const &tensor_binding : + inv.binding.get_tensor_bindings()) { + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{tensor_binding.first.slot_id, + SlotType::TENSOR, + tensor_binding.second.role, + tensor_binding.first.is_grad, + tensor_binding.second.slot_option}; + + if (!sig.get_tensor_slots().count(op_tensor_slot_spec)) { return false; } } - // FIXME -- make sure invocation doesn't contain MORE than signature - // https://github.com/flexflow/FlexFlow/issues/1442 return true; } -bool is_arg_type_invalid(std::type_index expected_arg_type, - OpArgSpec op_arg_spec) { - std::type_index arg_spec_type = get_op_arg_spec_type_index(op_arg_spec); - return arg_spec_type != expected_arg_type; -} - bool is_arg_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { - // FIXME -- arg signature/invocation checking - // https://github.com/flexflow/FlexFlow/issues/1442 - // auto sig_arg_types = sig.get_arg_types(); - // for (auto arg_binding : inv.binding.get_arg_bindings()) { - // std::type_index arg_type = sig_arg_types.at(arg_binding.first); - // assert (!is_arg_type_invalid(arg_type, arg_binding.second)); + // TODO: fix for device specific args + // for (std::pair const & arg_binding : + // inv.binding.get_arg_bindings()) { + // if (sig.get_arg_types().count(arg_binding.first)) { + // if (get_op_arg_spec_type_index(arg_binding.second) != + // sig.get_arg_types().at(arg_binding.first)) { + // return false; + // } + // } else { + // return false; + // } // } return true; diff --git a/lib/task-spec/src/task_arg_spec.cc b/lib/task-spec/src/task_arg_spec.cc new file mode 100644 index 0000000000..36fa2f71fd --- /dev/null +++ b/lib/task-spec/src/task_arg_spec.cc @@ -0,0 +1,11 @@ +#include "task-spec/task_arg_spec.h" +#include "utils/overload.h" + +namespace FlexFlow { + +std::type_index get_type_index(TaskArgSpec const &task_arg_spec) { + return task_arg_spec.visit( + overload{[](auto const &e) { return e.get_type_index(); }}); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task_invocation.cc b/lib/task-spec/src/task_invocation.cc index 4ba97f26de..e182231bda 100644 --- a/lib/task-spec/src/task_invocation.cc +++ b/lib/task-spec/src/task_invocation.cc @@ -1,9 +1,40 @@ #include 
"task-spec/task_invocation.h" +#include "task-spec/task_arg_spec.h" +#include "utils/containers/keys.h" namespace FlexFlow { bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { - NOT_IMPLEMENTED(); + TaskBinding binding = inv.binding; + + // args + for (std::pair const &arg_binding : + binding.get_arg_bindings()) { + if (sig.task_arg_types.count(arg_binding.first)) { + if (get_type_index(arg_binding.second) != + sig.task_arg_types.at(arg_binding.first)) { + return false; // incorrect arg type + } + } else { + return false; // slot doesn't exist in signature + } + } + + // tensors + for (std::pair const &tensor_binding : + binding.get_tensor_bindings()) { + slot_id_t tensor_slot_id = tensor_binding.first.slot_id; + if (sig.tensor_guid_slots.count(tensor_slot_id)) { + if (tensor_binding.first.tensor_type == + sig.tensor_guid_slots.at(tensor_slot_id).tensor_type) { + return false; // incorrect tensor type + } + } else { + return false; // slot doesn't exist in signature + } + } + + return true; } } // namespace FlexFlow From 6c84fb3feb79463d8ebe37c58833403a9b4a8b75 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 26 Feb 2025 05:19:17 -0800 Subject: [PATCH 52/91] chore: fix typo --- lib/realm-backend/include/realm-backend/driver.h | 2 +- .../include/realm-backend/model_training_instance.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/driver.h b/lib/realm-backend/include/realm-backend/driver.h index 77272c36ad..884b97a23d 100644 --- a/lib/realm-backend/include/realm-backend/driver.h +++ b/lib/realm-backend/include/realm-backend/driver.h @@ -3,7 +3,7 @@ #include "realm.h" #include "realm/cmdline.h" -#include "local-execution/task_invocation.h" +#include "task-spec/op_task_invocation.h" void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p); diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index a35cada2d2..62d8311ccb 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -4,7 +4,7 @@ #include "realm-backend/realm_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" namespace FlexFlow { From d6aa7ad7511f43ef5270901c8fe37d34c16ddd52 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 26 Feb 2025 18:12:09 -0800 Subject: [PATCH 53/91] chore: update realm allocator impl --- .../include/realm-backend/allocated_tensors.h | 30 ---- .../allocated_tensors.struct.toml | 32 ---- .../realm-backend/model_training_instance.h | 2 +- .../include/realm-backend/realm_allocator.h | 37 +---- .../realm_task_argument_accessor.h | 4 +- .../realm-backend/realm_tensor_backing.h | 12 +- .../realm-backend/realm_training_backing.h | 17 +- .../unallocated_tensors.struct.toml | 31 ---- lib/realm-backend/src/allocated_tensors.cc | 2 +- lib/realm-backend/src/realm_allocator.cc | 29 +--- .../src/realm_tensor_backing copy.cc | 142 ---------------- lib/realm-backend/src/realm_tensor_backing.cc | 22 +-- .../src/realm_training_backing.cc | 157 +++++++----------- 13 files changed, 99 insertions(+), 418 deletions(-) delete mode 100644 lib/realm-backend/include/realm-backend/allocated_tensors.h delete mode 100644 
lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml delete mode 100644 lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml delete mode 100644 lib/realm-backend/src/realm_tensor_backing copy.cc diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.h b/lib/realm-backend/include/realm-backend/allocated_tensors.h deleted file mode 100644 index 8effd06954..0000000000 --- a/lib/realm-backend/include/realm-backend/allocated_tensors.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H -#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H - -#include "realm-backend/allocated_tensors.dtg.h" -#include "pcg/computation_graph.h" - -namespace FlexFlow { - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool are_allocated_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &, - std::unordered_map const &, - ArrayShape const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml deleted file mode 100644 index d459027e5d..0000000000 --- a/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml +++ /dev/null @@ -1,32 +0,0 @@ -namespace = "FlexFlow" -name = "AllocatedTensors" -features = [ - "eq", - "fmt", - "hash", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "kernels/accessor.h", - "realm-backend/realm_allocator.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, std::pair<::FlexFlow::RealmRegion,::FlexFlow::TensorShape>>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 62d8311ccb..e30ae7a9a8 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -14,7 +14,7 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(RealmTrainingBacking const &, tensor_guid_t const &logit_tensor, - TensorShape const &label_tensor_shape, + loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); diff --git a/lib/realm-backend/include/realm-backend/realm_allocator.h b/lib/realm-backend/include/realm-backend/realm_allocator.h index 1e0c7b23c4..304ca38e32 100644 --- a/lib/realm-backend/include/realm-backend/realm_allocator.h +++ b/lib/realm-backend/include/realm-backend/realm_allocator.h @@ -3,56 +3,31 @@ #include "realm-backend/driver.h" #include "realm.h" +#include "kernels/allocation.h" #include namespace FlexFlow { struct RealmAllocatorImpl; -struct RealmRegion { - Realm::RegionInstance instance; - RealmAllocatorImpl 
*allocator; -}; - -struct RealmAllocatorImpl { +struct RealmAllocatorImpl : public IAllocator { RealmAllocatorImpl() = delete; RealmAllocatorImpl(RealmAllocatorImpl const &) = delete; RealmAllocatorImpl(RealmAllocatorImpl &&) = delete; RealmAllocatorImpl(Realm::Processor); ~RealmAllocatorImpl() = default; - RealmRegion allocate(size_t); - void deallocate(RealmRegion); + void *allocate(size_t) override; + void deallocate(void *) override; private: - std::unordered_map ptrs; + std::unordered_map ptrs; Realm::Processor proc; Realm::Memory mem; std::vector field_sizes = {sizeof(char)}; }; -struct RealmAllocator { - RealmAllocator() = delete; - - RealmRegion allocate(size_t); - void deallocate(RealmRegion); - - template - static typename std::enable_if::value, - RealmAllocator>::type - create(Args &&...args) { - return RealmAllocator(std::make_shared(std::forward(args)...)); - } - - RealmAllocator(std::shared_ptr ptr) : i_allocator(ptr) {}; - RealmAllocator(RealmAllocator const &allocator) - : i_allocator(allocator.i_allocator) {}; - -private: - std::shared_ptr i_allocator; -}; - -RealmAllocator create_realm_memory_allocator(Realm::Processor); +Allocator create_realm_memory_allocator(Realm::Processor); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index ce826e162e..d5c1a63b48 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -15,7 +15,7 @@ using TensorSlotsBacking = std::unordered_map< using ArgSlotsBacking = std::unordered_map; struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { - RealmTaskArgumentAccessor(RealmAllocator const &allocator, + RealmTaskArgumentAccessor(Allocator const &allocator, TensorSlotsBacking const &tensor_slots_backing, ArgSlotsBacking const &arg_slots_backing); @@ -35,7 +35,7 @@ struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { size_t get_device_idx() const override; private: - RealmAllocator allocator; + Allocator allocator; TensorSlotsBacking tensor_slots_backing; ArgSlotsBacking arg_slots_backing; }; diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index 25136ad2ff..dac93c84b0 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -10,21 +10,21 @@ #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "realm-backend/allocated_tensors.dtg.h" +#include "local-execution/allocated_tensors.dtg.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/unallocated_tensors.dtg.h" +#include "local-execution/unallocated_tensors.dtg.h" #include "task-spec/lowered_tensor_t.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/tensor_role.dtg.h" namespace FlexFlow { -using TensorBackingMap = std::unordered_map>; +using TensorBackingMap = std::unordered_map; struct RealmTensorBacking { RealmTensorBacking(AllocatedTensors const &, UnallocatedTensors const &, - RealmAllocator const &); + Allocator const &); public: GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; @@ -45,15 +45,13 @@ struct RealmTensorBacking { std::unordered_map> tensor_optimizer_mapping; - 
RealmAllocator allocator; + Allocator allocator; private: lowered_tensor_t insert_tensor(TensorTypeVariant const &); LoweredTensorSource lowered_tensor_source; }; -GenericTensorAccessorW wrappup_tensor_accessor(std::pair const &); - UnallocatedTensors generate_unallocated_tensors( AllocatedTensors const &, std::unordered_map const &, diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 81df422b7a..45285464b8 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -6,7 +6,7 @@ #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "realm-backend/allocated_tensors.dtg.h" +#include "local-execution/allocated_tensors.dtg.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_args_backing.h" @@ -19,20 +19,27 @@ using PerLayerElapsedTime = std::unordered_map>; struct RealmTrainingBacking { - RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + RealmTrainingBacking(Realm::Processor, + std::vector const &, + std::vector const &, + AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &); - RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + RealmTrainingBacking(Realm::Processor, + std::vector const &, + std::vector const &, + AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &, OptimizerAttrs const &); public: // runtime Realm::Processor master_proc; + Realm::Event master_event; Realm::Memory master_mem; std::vector worker_procs; - std::unordered_map proc_events; - std::vector allocators; + std::vector worker_events; + std::vector allocators; RealmTensorBacking realm_tensor_backing; RealmArgsBacking realm_args_backing; diff --git a/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml deleted file mode 100644 index e86cc2a532..0000000000 --- a/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "UnallocatedTensors" -features = [ - "eq", - "fmt", - "hash", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "op-attrs/tensor_shape.dtg.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_shapes" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/src/allocated_tensors.cc b/lib/realm-backend/src/allocated_tensors.cc index f27db14643..3e249bf6d1 100644 --- a/lib/realm-backend/src/allocated_tensors.cc +++ b/lib/realm-backend/src/allocated_tensors.cc @@ -1,4 +1,4 @@ -#include "realm-backend/allocated_tensors.h" +#include "local-execution/allocated_tensors.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/keys.h" #include "utils/containers/set_union.h" diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc index 
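
This patch swaps the bespoke RealmRegion/RealmAllocator pair for the kernels
library's type-erased Allocator over IAllocator. The create-by-implementation
call in the hunk below follows the usual type-erasure idiom; here is a minimal
sketch under assumed, simplified signatures (malloc backing and the
IAllocator/Allocator shapes are illustrative stand-ins, not the actual
kernels API):

    #include <cstddef>
    #include <cstdlib>
    #include <memory>
    #include <utility>

    struct IAllocator {
      virtual void *allocate(std::size_t) = 0;
      virtual void deallocate(void *) = 0;
      virtual ~IAllocator() = default;
    };

    // Example backing implementation (malloc here; RealmAllocatorImpl in the
    // patch keeps a ptr -> RegionInstance map instead).
    struct MallocAllocatorImpl final : public IAllocator {
      void *allocate(std::size_t size) override { return std::malloc(size); }
      void deallocate(void *ptr) override { std::free(ptr); }
    };

    // Type-erased handle: callers hold an Allocator by value and never see
    // the concrete implementation type.
    struct Allocator {
      template <typename T, typename... Args>
      static Allocator create(Args &&...args) {
        return Allocator{std::make_shared<T>(std::forward<Args>(args)...)};
      }
      void *allocate(std::size_t size) { return this->impl->allocate(size); }
      void deallocate(void *ptr) { this->impl->deallocate(ptr); }

    private:
      explicit Allocator(std::shared_ptr<IAllocator> impl)
          : impl(std::move(impl)) {}
      std::shared_ptr<IAllocator> impl;
    };

    int main() {
      Allocator a = Allocator::create<MallocAllocatorImpl>();
      void *p = a.allocate(64);
      a.deallocate(p);
      return 0;
    }
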
fadc7f5719..d7139210bc 100644
--- a/lib/realm-backend/src/realm_allocator.cc
+++ b/lib/realm-backend/src/realm_allocator.cc
@@ -15,40 +15,29 @@ RealmAllocatorImpl::RealmAllocatorImpl(Processor proc) : proc(proc) {
 }
 
 // TODO: now the region instance only corresponds to one tensor
-RealmRegion RealmAllocatorImpl::allocate(size_t requested_memory_size) {
+void *RealmAllocatorImpl::allocate(size_t requested_memory_size) {
   Rect<1> bounds(Point<1>(0), Point<1>(requested_memory_size - 1));
   RegionInstance requested_instance = RegionInstance::NO_INST;
   RegionInstance::create_instance(requested_instance, mem, bounds, field_sizes,
                                   /*SOA*/ 1, ProfilingRequestSet())
       .wait();
   void *ptr = requested_instance.pointer_untyped(0, 0);
-  this->ptrs.insert({requested_instance, ptr});
-  return {requested_instance, this};
+  this->ptrs.insert({ptr, requested_instance});
+  return ptr;
 }
 
-void RealmAllocatorImpl::deallocate(RealmRegion region) {
-  if (region.allocator == this and contains_key(this->ptrs, region.instance)) {
-    RegionInstance instance = this->ptrs.at(region.instance);
-    instance.destroy();
+void RealmAllocatorImpl::deallocate(void *ptr) {
+  if (this->ptrs.count(ptr)) {
+    RegionInstance region = this->ptrs.at(ptr);
+    region.destroy();
+    this->ptrs.erase(ptr); // drop the stale mapping so the address can be reused
   } else {
     throw std::runtime_error(
        "Deallocating a pointer that was not allocated by this Allocator");
   }
 }
-
-/*********** RealmAllocator ***********/
-
-RealmRegion RealmAllocator::allocate(size_t mem_size) {
-  return this->i_allocator->allocate(mem_size);
-}
-
-void RealmAllocator::deallocate(RealmRegion region) {
-  this->i_allocator->deallocate(region);
-}
-
-RealmAllocator create_realm_memory_allocator(Processor proc) {
-  return RealmAllocator::create(proc);
+Allocator create_realm_memory_allocator(Processor proc) {
+  return Allocator::create<RealmAllocatorImpl>(proc);
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-backend/src/realm_tensor_backing copy.cc b/lib/realm-backend/src/realm_tensor_backing copy.cc
deleted file mode 100644
index bac16c6b69..0000000000
--- a/lib/realm-backend/src/realm_tensor_backing copy.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-#include "task-spec/slot_grad_id.dtg.h"
-
-#include "op-attrs/parallel_tensor_shape.h"
-#include "pcg/computation_graph.h"
-#include "pcg/optimizer_attrs.h"
-#include "realm-backend/realm_allocator.h"
-#include "realm-backend/realm_tensor_backing.h"
-#include "utils/containers/contains_key.h"
-#include "utils/containers/keys.h"
-#include "utils/overload.h"
-
-namespace FlexFlow {
-
-RealmTensorBacking::RealmTensorBacking() {};
-
-void RealmTensorBacking::allocate_layer_tensors(
-    layer_guid_t const &layer_guid, ComputationGraph const &computation_graph,
-    RealmAllocator &allocator) {
-  this->allocate_tensors_by_role(TensorRole::INPUT, layer_guid,
-                                 computation_graph, allocator);
-  this->allocate_tensors_by_role(TensorRole::WEIGHT, layer_guid,
-                                 computation_graph, allocator);
-  this->allocate_tensors_by_role(TensorRole::OUTPUT, layer_guid,
-                                 computation_graph, allocator);
-}
-
-void RealmTensorBacking::allocate_tensors_by_role(
-    TensorRole const &role, layer_guid_t const &layer_guid,
-    ComputationGraph const &computation_graph, RealmAllocator &allocator) {
-  std::vector<tensor_guid_t> tensors;
-  switch (role) {
-    case TensorRole::INPUT:
-      tensors = get_incoming_inputs(computation_graph, layer_guid);
-      break;
-    case TensorRole::WEIGHT:
-      tensors = get_incoming_weights(computation_graph, layer_guid);
-      break;
-    case TensorRole::OUTPUT:
-      tensors = get_outgoing_tensors(computation_graph, layer_guid);
-      break;
-    default:
-      throw mk_runtime_error("Invalid tensor
role, got {}", role); - } - - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); - // tensor allocation - if (!contains_key(this->tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = - allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); - } - - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && - !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = - allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); - } - } -} - -void RealmTensorBacking::allocate_optimizer_tensors( - tensor_guid_t const &weight, - std::vector const &optimizer_tensors, - RealmAllocator &allocator) { - GenericTensorAccessorW weight_backing = - this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); - for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) { - // optimizer tensor allocation - if (!contains_key(this->optimizer_tensor_lowering_mapping, - optimizer_tensor)) { - lowered_tensor_t buffer_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - this->optimizer_tensor_lowering_mapping.insert( - {optimizer_tensor, buffer_tensor}); - TensorShape tensor_shape = - get_tensor_shape(weight_backing.shape, weight_backing.data_type); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape)); - this->tensor_regions.insert({buffer_tensor, region}); - this->tensor_shapes.insert({buffer_tensor, tensor_shape}); - } - } -} - -bool RealmTensorBacking::is_tensor_allocated( - lowered_tensor_t const &tensor_id) const { - return contains_key(tensor_regions, tensor_id); -} - -GenericTensorAccessorW const &RealmTensorBacking::get_tensor_backing( - lowered_tensor_t const &tensor_id) const { - void *ptr = this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0); - TensorShape shape = this->tensor_shapes.at(tensor_id); - return {shape.data_type, ArrayShape{shape}, ptr}; -} - -TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { - TensorSlotsBacking mapping; - - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - - lowered_tensor_t tensor_id = [&] { - TensorTypeVariant tensor_type = tensor_binding.second; - if (tensor_type.has() and - slot_tensor_type_id.tensor_type == TensorType::FORWARD) { - return this->tensor_lowering_mapping.at( - tensor_type.get()); - } else if (tensor_type.has() and - slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { - return this->gradient_tensor_lowering_mapping.at( - tensor_type.get()); - } else if (tensor_type.has()) { - return this->optimizer_tensor_lowering_mapping.at( - tensor_type.get()); - } else if (tensor_type.has()) { - return this->loss_tensor_lowering_mapping.at( - tensor_type.get()); - } else { - throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); - } - }(); - - 
GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); - mapping.insert({slot_tensor_type_id, accessor}); - } - - return mapping; -} - -} // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc index 8f8f828821..12d0973fba 100644 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ b/lib/realm-backend/src/realm_tensor_backing.cc @@ -2,7 +2,7 @@ #include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" -#include "realm-backend/allocated_tensors.h" +#include "local-execution/allocated_tensors.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_tensor_backing.h" #include "task-spec/slot_grad_id.dtg.h" @@ -12,23 +12,16 @@ namespace FlexFlow { -GenericTensorAccessorW wrappup_tensor_accessor( - std::pair const &tensor_region_shape) { - void *ptr = tensor_region_shape.first.instance.pointer_untyped(0, 0); - TensorShape shape = tensor_region_shape.second; - return {shape.data_type, ArrayShape{shape}, ptr}; -} - RealmTensorBacking::RealmTensorBacking( AllocatedTensors const &allocated_tensors, UnallocatedTensors const &unallocated_tensors, - RealmAllocator const &allocator) + Allocator const &allocator) : tensor_gradient_mapping(allocated_tensors.gradient_mapping), tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), allocator(allocator) { // handle already-allocated tensors - for (std::pair> const + for (std::pair const &tensor_type_backing : allocated_tensors.tensor_type_backings) { lowered_tensor_t lowered_tensor = this->insert_tensor(tensor_type_backing.first); @@ -59,10 +52,9 @@ RealmTensorBacking::RealmTensorBacking( unallocated_tensors.tensor_type_shapes) { lowered_tensor_t lowered_tensor = this->insert_tensor(tensor_type_shape.first); - RealmRegion region = allocator.allocate( - get_size_in_bytes(tensor_type_shape.second).unwrap_nonnegative()); - this->tensor_backings.insert( - {lowered_tensor, {region, tensor_type_shape.second}}); + GenericTensorAccessorW tensor_backing = + this->allocator.allocate_tensor(tensor_type_shape.second); + this->tensor_backings.insert({lowered_tensor, tensor_backing}); } }; @@ -117,7 +109,7 @@ RealmTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { throw mk_runtime_error( fmt::format("Unhandled tensor type {}", any_tensor)); }}); - return wrappup_tensor_accessor(this->tensor_backings.at(lowered_tensor)); + return this->tensor_backings.at(lowered_tensor); } UnallocatedTensors generate_unallocated_tensors( diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index f6b516e303..225a376cf3 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -23,26 +23,34 @@ namespace FlexFlow { using namespace Realm; RealmTrainingBacking::RealmTrainingBacking( - Processor master_proc, AllocatedTensors const &allocated_tensors, + Processor master_proc, std::vector const &worker_procs, + std::vector const &allocators, + AllocatedTensors const &allocated_tensors, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) - : computation_graph(computation_graph), + : master_proc(master_proc), worker_procs(worker_procs), + allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph)))) { - master_proc = master_proc; - proc_events.insert({master_proc, 
Realm::Event::NO_EVENT}); + get_layer_attrs_mapping(this->computation_graph))), + realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source), + this->allocators[0])), + realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) .best_affinity_to(master_proc) .first(); - Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) - .only_kind(Processor::TOC_PROC); - for (Processor p : pq) { - worker_procs.push_back(p); - proc_events.insert({p, Realm::Event::NO_EVENT}); - allocators.push_back(RealmAllocator::create(p)); + for (Processor p : worker_procs) { + worker_events.push_back(Realm::Event::NO_EVENT); } - assert(worker_procs.size() > 0); + // Machine::ProcessorQuery pq = + // Machine::ProcessorQuery(Machine::get_machine()) + // .only_kind(Processor::TOC_PROC); + // allocators.push_back(create_realm_memory_allocator(p)); // register tasks for realm for (layer_guid_t const &node : @@ -60,41 +68,35 @@ RealmTrainingBacking::RealmTrainingBacking( } } } - - // TODO: multi gpu - realm_tensor_backing = RealmTensorBacking( - allocated_tensors, - generate_unallocated_tensors( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), - allocators[0]); - realm_args_backing = - initialize_args_backing(this->task_registry, this->computation_graph, - runtime_arg_config, this->realm_tensor_backing); } RealmTrainingBacking::RealmTrainingBacking( - Processor master_proc, AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, - OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph)))) { - master_proc = master_proc; - proc_events.insert({master_proc, Realm::Event::NO_EVENT}); + Processor master_proc, std::vector const &worker_procs, + std::vector const &allocators, + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs) + : master_proc(master_proc), worker_procs(worker_procs), + allocators(allocators), computation_graph(computation_graph), + task_registry(construct_task_registry( + get_layer_attrs_mapping(this->computation_graph))), + realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source, this->optimizer_tensor_source, + optimizer_attrs), + this->allocators[0])), + realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) .best_affinity_to(master_proc) .first(); - Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) - .only_kind(Processor::TOC_PROC); - for (Processor p : pq) { - worker_procs.push_back(p); - proc_events.insert({p, Realm::Event::NO_EVENT}); - allocators.push_back(RealmAllocator::create(p)); + for (Processor p : worker_procs) { + worker_events.push_back(Realm::Event::NO_EVENT); } - 
assert(worker_procs.size() > 0); // register tasks for realm for (layer_guid_t const &node : @@ -112,16 +114,6 @@ RealmTrainingBacking::RealmTrainingBacking( } } } - - // TODO: multi gpu - realm_tensor_backing = RealmTensorBacking( - allocated_tensors, - generate_unallocated_tensors_with_optimizer( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, this->optimizer_tensor_source, - optimizer_attrs), - allocators[0]); - realm_args_backing = initialize_args_backing(this, runtime_arg_config); } RealmArgsBacking @@ -140,7 +132,7 @@ initialize_args_backing(RealmTrainingBacking *backing, Processor master_proc = backing->master_proc; Memory master_mem = backing->master_mem; std::vector &worker_procs = backing->worker_procs; - std::unordered_map &proc_events = backing->proc_events; + std::vector &worker_events = backing->worker_events; for (layer_guid_t const &node : topological_ordering(cg)) { if (registry_contains_task_for_layer(task_registry, node, @@ -164,10 +156,10 @@ initialize_args_backing(RealmTrainingBacking *backing, Future future = promise.get_future(); RealmTaskArgs args{ task_id, impl_function, accessor, std::move(promise)}; - Event e = worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), - proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; + Event e = + worker_procs[0].spawn(static_cast(task_id), + &args, sizeof(args), worker_events[0]); + worker_events[0] = e; future.set_event(e); per_device_op_states.insert({node, std::move(future.get())}); } @@ -176,35 +168,6 @@ initialize_args_backing(RealmTrainingBacking *backing, return RealmArgsBacking{runtime_arg_config, per_device_op_states}; } -// void RealmTrainingBacking::register_and_allocate_layer( -// layer_guid_t const &node) { -// ComputationGraphOpAttrs attrs = -// get_layer_attrs(this->computation_graph, node).attrs; -// this->realm_tensor_backing.allocate_layer_tensors( -// node, this->computation_graph, this->allocators[0]); -// } - -// void RealmTrainingBacking::allocate_layer_optimizer_tensors( -// layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { -// ComputationGraphOpAttrs attrs = -// get_layer_attrs(this->computation_graph, node).attrs; -// if (attrs.has()) { -// TaskSignature sig = get_update_signature(optimizer_attrs); -// tensor_guid_t weight_tensor = -// get_only(get_outgoing_tensors(this->computation_graph, node)); - -// std::vector optimizer_tensors; -// for (TensorTypeSlotSpec const &tensor_type_slot_spec : -// values(sig.tensor_guid_slots)) { -// optimizer_tensors.push_back( -// this->optimizer_tensor_source.new_optimizer_tensor()); -// } -// this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); -// this->realm_tensor_backing.allocate_optimizer_tensors( -// weight_tensor, optimizer_tensors, this->allocators[0]); -// } -// } - Future> execute_forward(RealmTrainingBacking &realm_training_backing, layer_guid_t const &operator_node) { @@ -242,10 +205,8 @@ execute_forward(RealmTrainingBacking &realm_training_backing, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } else { @@ -290,10 +251,8 @@ execute_backward(RealmTrainingBacking &realm_training_backing, 
std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } else { @@ -301,7 +260,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, } } -Future execute_update(RealmTrainingBacking const &realm_training_backing, +Future execute_update(RealmTrainingBacking &realm_training_backing, layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { LayerAttrs layer_attrs = @@ -341,10 +300,8 @@ Future execute_update(RealmTrainingBacking const &realm_training_backing, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } else { @@ -352,7 +309,7 @@ Future execute_update(RealmTrainingBacking const &realm_training_backing, } } -Future compute_loss(RealmTrainingBacking const &realm_training_backing, +Future compute_loss(RealmTrainingBacking &realm_training_backing, LossAttrs const &loss_attrs, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor) { @@ -377,10 +334,8 @@ Future compute_loss(RealmTrainingBacking const &realm_training_backing, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } From 419cca873751ed93f9ba0887f87fa5798cad4539 Mon Sep 17 00:00:00 2001 From: fruitea Date: Mon, 3 Mar 2025 08:16:03 -0800 Subject: [PATCH 54/91] chore: eliminate std::optional --- .../include/realm-backend/realm_training_backing.h | 8 ++++---- lib/realm-backend/src/model_training_instance.cc | 8 ++++---- lib/realm-backend/src/task_wrapper.cc | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 45285464b8..1b756b14d3 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -55,10 +55,10 @@ RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, RuntimeArgConfig const &); void execute_init(RealmTrainingBacking &, layer_guid_t const &); -Future> execute_forward(RealmTrainingBacking &, - layer_guid_t const &); -Future> execute_backward(RealmTrainingBacking &, - layer_guid_t const &); +Future execute_forward(RealmTrainingBacking &, + layer_guid_t const &); +Future execute_backward(RealmTrainingBacking &, + layer_guid_t const &); Future compute_loss(RealmTrainingBacking &, LossAttrs const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor); diff --git a/lib/realm-backend/src/model_training_instance.cc 
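
Each spawn site above follows the same shape: allocate a Promise in master
memory, keep its Future, chain the task on the processor's previous event,
and block only in a later pass. A plain-C++ analogue of that two-phase
pattern (std::async standing in for Processor::spawn; the real code threads
Realm events through a custom Promise/Future) is:

    #include <future>
    #include <vector>

    // Stand-in for launching one layer's task; returns its "elapsed time".
    float run_layer(int layer_idx) {
      return 1.5f * static_cast<float>(layer_idx);
    }

    int main() {
      // pass 1: launch everything, collect futures without blocking
      std::vector<std::future<float>> pending;
      for (int layer = 0; layer < 4; ++layer) {
        pending.push_back(std::async(std::launch::async, run_layer, layer));
      }
      // pass 2: block on each result, mirroring the per-layer .get() loop
      float total = 0.0f;
      for (std::future<float> &f : pending) {
        total += f.get();
      }
      return total > 0.0f ? 0 : 1;
    }
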
b/lib/realm-backend/src/model_training_instance.cc index acb8edb314..aa8c30b34f 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -17,7 +17,7 @@ namespace FlexFlow { PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map>> + std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : topological_ordering( model_training_instance.training_backing.computation_graph)) { @@ -26,7 +26,7 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { } for (layer_guid_t const &node : topological_ordering( model_training_instance.training_backing.computation_graph)) { - std::optional elapsed_time = + float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); } @@ -40,7 +40,7 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { model_training_instance.label_tensor); PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map>> + std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : reversed(topological_ordering( model_training_instance.training_backing.computation_graph))) { @@ -49,7 +49,7 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { } for (layer_guid_t const &node : reversed(topological_ordering( model_training_instance.training_backing.computation_graph))) { - std::optional elapsed_time = + float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); } diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index e58d2611af..ea36275462 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -16,12 +16,12 @@ void init_wrapper_task(const void *args, size_t arglen, const void *userdata, void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs> const &task_args = - *reinterpret_cast> *>(args); + RealmTaskArgs const &task_args = + *reinterpret_cast *>(args); auto fn = task_args.impl_function.get().function_ptr; std::optional result = fn(task_args.accessor); - task_args.promise.set_value(std::move(result)); + task_args.promise.set_value(result.has_value() ? 
result.value() : 0.0f); } void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, From 2c0b5738e13c2671fa4b028c154cc5545f799220 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 4 Mar 2025 22:49:40 -0800 Subject: [PATCH 55/91] feat: buildable realm-backend --- .../include/realm-backend/task_result.h | 8 ++- .../src/realm_training_backing.cc | 62 +++++++++---------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index 4cf8916f85..bac20ddd14 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -47,17 +47,18 @@ template class Future { public: explicit Future(std::shared_ptr> state) : state_(std::move(state)) {} + explicit Future() = default; explicit Future(T value) : value_(std::move(value)) {} void set_event(Realm::Event e) { state_->set_event(e); } T get() { - value_ = state_->get_value(); - return value_; + value_ = std::make_optional(state_->get_value()); + return value_.value(); } void wait() { state_->wait(); } private: std::shared_ptr> state_; - T value_; + std::optional value_ = std::nullopt; }; // Specialization of Future for the `void` type, as it does not carry a value. @@ -67,6 +68,7 @@ template <> class Future { : state_(std::move(state)) {} explicit Future() = default; void set_event(Realm::Event e) { state_->set_event(e); } + void get() { state_->wait(); } void wait() { state_->wait(); } private: diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 225a376cf3..d0b985921e 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -53,19 +53,16 @@ RealmTrainingBacking::RealmTrainingBacking( // allocators.push_back(create_realm_memory_allocator(p)); // register tasks for realm - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - if (attrs.has()) { - OpTaskInvocation op_task_invocation = attrs.get(); - std::vector task_ids = get_task_ids(attrs); - for (task_id_t task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = - this->task_registry.task_mapping.at(task_id); + std::unordered_map const &layer_attrs_mapping = + get_layer_attrs_mapping(this->computation_graph); + for (std::pair const &layer_attrs : + layer_attrs_mapping) { + ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); // TODO: multi gpu register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); - } } } } @@ -99,19 +96,16 @@ RealmTrainingBacking::RealmTrainingBacking( } // register tasks for realm - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - if (attrs.has()) { - OpTaskInvocation op_task_invocation = attrs.get(); - std::vector task_ids = get_task_ids(attrs); - for (task_id_t task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = - this->task_registry.task_mapping.at(task_id); + std::unordered_map const &layer_attrs_mapping = + get_layer_attrs_mapping(this->computation_graph); + for (std::pair const &layer_attrs : + 
layer_attrs_mapping) { + ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); // TODO: multi gpu register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); - } } } } @@ -168,7 +162,7 @@ initialize_args_backing(RealmTrainingBacking *backing, return RealmArgsBacking{runtime_arg_config, per_device_op_states}; } -Future> +Future execute_forward(RealmTrainingBacking &realm_training_backing, layer_guid_t const &operator_node) { if (registry_contains_task_for_layer(realm_training_backing.task_registry, @@ -199,10 +193,10 @@ execute_forward(RealmTrainingBacking &realm_training_backing, realm_training_backing.task_registry.task_mapping.at(task_id) .impl_function; // TODO: multi gpu launching - Promise> promise(realm_training_backing.master_mem); - Future> future = promise.get_future(); - RealmTaskArgs> args{task_id, impl_function, accessor, - std::move(promise)}; + Promise promise(realm_training_backing.master_mem); + Future future = promise.get_future(); + RealmTaskArgs args{task_id, impl_function, accessor, + std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); @@ -210,11 +204,11 @@ execute_forward(RealmTrainingBacking &realm_training_backing, future.set_event(e); return future; } else { - return Future>(std::nullopt); + return Future(0.0f); } } -Future> +Future execute_backward(RealmTrainingBacking &realm_training_backing, layer_guid_t const &operator_node) { if (registry_contains_task_for_layer(realm_training_backing.task_registry, @@ -245,10 +239,10 @@ execute_backward(RealmTrainingBacking &realm_training_backing, realm_training_backing.task_registry.task_mapping.at(task_id) .impl_function; // TODO: multi gpu launching - Promise> promise(realm_training_backing.master_mem); - Future> future = promise.get_future(); - RealmTaskArgs> args{task_id, impl_function, accessor, - std::move(promise)}; + Promise promise(realm_training_backing.master_mem); + Future future = promise.get_future(); + RealmTaskArgs args{task_id, impl_function, accessor, + std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); @@ -256,7 +250,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, future.set_event(e); return future; } else { - return Future>(std::nullopt); + return Future(0.0f); } } From 062825e7fd04b561b84a36374a2c8df24ef220dc Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 5 Mar 2025 00:47:14 -0800 Subject: [PATCH 56/91] chore: Move realm tensor backing to dtgen --- .../realm-backend/model_training_instance.h | 9 +- .../realm_task_argument_accessor.h | 9 - .../realm-backend/realm_tensor_backing.h | 79 +++--- .../realm_tensor_backing.struct.toml | 34 +++ .../realm-backend/realm_training_backing.h | 6 +- .../src/model_training_instance.cc | 38 +-- .../src/realm_task_argument_accessor.cc | 31 --- lib/realm-backend/src/realm_tensor_backing.cc | 227 ++++-------------- .../src/realm_training_backing.cc | 49 ++-- 9 files changed, 172 insertions(+), 310 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 
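
A side note on the registration loops above: because they iterate per layer,
a task id shared by several layers is handed to register_wrapper_tasks more
than once. If that ever becomes a problem, a guard along these lines
(hypothetical register_task callback, not the real API) would keep
registration idempotent:

    #include <functional>
    #include <unordered_set>
    #include <vector>

    using task_id_t = int; // stand-in for the real task id type

    void register_unique_tasks(
        std::vector<std::vector<task_id_t>> const &task_ids_per_layer,
        std::function<void(task_id_t)> const &register_task) {
      std::unordered_set<task_id_t> seen;
      for (std::vector<task_id_t> const &layer_task_ids : task_ids_per_layer) {
        for (task_id_t task_id : layer_task_ids) {
          if (seen.insert(task_id).second) {
            register_task(task_id); // only the first occurrence registers
          }
        }
      }
    }
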
e30ae7a9a8..6c92b1de4a 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -23,11 +23,12 @@ struct ModelTrainingInstance { loss_tensor_t label_tensor; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; -}; -PerLayerElapsedTime forward(ModelTrainingInstance &); -PerLayerElapsedTime backward(ModelTrainingInstance &); -void update(ModelTrainingInstance &); +public: + PerLayerElapsedTime forward(); + PerLayerElapsedTime backward(); + void update(); +}; } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index d5c1a63b48..256e69c301 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -40,15 +40,6 @@ struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { ArgSlotsBacking arg_slots_backing; }; -using TensorSlotsBackingWithoutAddresses = std::unordered_map< - SlotTensorTypeId, - std::variant, - std::vector>>>; - -TensorSlotsBackingWithoutAddresses -get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &); - CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index dac93c84b0..b38815ffee 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -3,68 +3,45 @@ #define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H #include "kernels/accessor.h" +#include "local-execution/allocated_tensors.dtg.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/loss_tensor_source.h" -#include "local-execution/lowered_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" +#include "local-execution/unallocated_tensors.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/allocated_tensors.dtg.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_task_argument_accessor.h" -#include "local-execution/unallocated_tensors.dtg.h" +#include "realm-backend/realm_tensor_backing.dtg.h" #include "task-spec/lowered_tensor_t.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/tensor_role.dtg.h" - namespace FlexFlow { -using TensorBackingMap = std::unordered_map; - -struct RealmTensorBacking { - RealmTensorBacking(AllocatedTensors const &, UnallocatedTensors const &, - Allocator const &); - -public: - GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; - -public: - // tensors - TensorBackingMap tensor_backings; - - std::unordered_map tensor_lowering_mapping; - std::unordered_map - gradient_tensor_lowering_mapping; - std::unordered_map - optimizer_tensor_lowering_mapping; - std::unordered_map - loss_tensor_lowering_mapping; - - std::unordered_map tensor_gradient_mapping; + GenericTensorAccessorW get_tensor(RealmTensorBacking const &, + TensorTypeVariant const &); + + std::unordered_map + get_tensor_backings( + std::unordered_map const &, + std::unordered_map const &, + Allocator &); + std::unordered_map> - tensor_optimizer_mapping; - - Allocator allocator; - -private: - lowered_tensor_t 
insert_tensor(TensorTypeVariant const &); - LoweredTensorSource lowered_tensor_source; -}; - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &); - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &, OptimizerTensorSource &, OptimizerAttrs const &); - -TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, - TaskBinding const &); - -} // namespace FlexFlow - -#endif + merge_optimizer_mappings( + std::unordered_map> const + &allocated, + std::unordered_map> const + &unallocated); + + RealmTensorBacking construct_realm_tensor_backing(AllocatedTensors const &, + UnallocatedTensors const &, + Allocator &); + + TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, + TaskBinding const &); + + } // namespace FlexFlow + + #endif \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml new file mode 100644 index 0000000000..92a074e4fc --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "RealmTensorBacking" +features = [ + "eq", + "fmt", + "hash" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h", + "pcg/tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_t.dtg.h", + "task-spec/optimizer_tensor_t.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "tensor_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" + +[[fields]] +name = "tensor_gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "tensor_optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 1b756b14d3..ee426324cb 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -6,7 +6,8 @@ #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/allocated_tensors.dtg.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/unallocated_tensors.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_args_backing.h" @@ -67,7 +68,8 @@ Future execute_update(RealmTrainingBacking &, layer_guid_t const &, TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &, RealmArgsBacking const &, - TaskInvocation const &); + TaskInvocation const &, + Allocator &); } // namespace FlexFlow diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index aa8c30b34f..8ced02e95a 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -15,17 +15,17 @@ namespace FlexFlow { optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), 
label_tensor(label_tensor){}; -PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { +PerLayerElapsedTime ModelTrainingInstance::forward() { PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { per_layer_elapsed_time_future.insert( - {node, execute_forward(model_training_instance.training_backing, node)}); + {node, execute_forward(this->training_backing, node)}); } for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -33,22 +33,22 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { - compute_loss(model_training_instance.training_backing, - model_training_instance.loss_attrs, - model_training_instance.logit_tensor, - model_training_instance.label_tensor); +PerLayerElapsedTime ModelTrainingInstance::backward() { + compute_loss(this->training_backing, + this->loss_attrs, + this->logit_tensor, + this->label_tensor); PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : reversed(topological_ordering( - model_training_instance.training_backing.computation_graph))) { + this->training_backing.computation_graph))) { per_layer_elapsed_time_future.insert( - {node, execute_backward(model_training_instance.training_backing, node)}); + {node, execute_backward(this->training_backing, node)}); } for (layer_guid_t const &node : reversed(topological_ordering( - model_training_instance.training_backing.computation_graph))) { + this->training_backing.computation_graph))) { float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -56,21 +56,21 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -void update(ModelTrainingInstance &model_training_instance) { +void ModelTrainingInstance::update() { std::unordered_map> per_layer_update_future; for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { per_layer_update_future.insert( - {node, execute_update(model_training_instance.training_backing, + {node, execute_update(this->training_backing, node, - model_training_instance.optimizer_attrs)}); + this->optimizer_attrs)}); } for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { per_layer_update_future[node].wait(); } - model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter( - model_training_instance.optimizer_attrs); + this->optimizer_attrs = get_optimizer_attrs_for_next_iter( + this->optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc index 7b27bad6c2..c7e81da01d 100644 --- a/lib/realm-backend/src/realm_task_argument_accessor.cc +++ 
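
update() above advances the optimizer attrs once per step through
get_optimizer_attrs_for_next_iter. For Adam-style attrs that step amounts to
decaying the running beta products and recomputing the bias-corrected
learning rate; a toy version with illustrative field names (not the real
OptimizerAttrs) looks like:

    #include <cmath>

    // Illustrative stand-in for Adam-style optimizer attrs.
    struct ToyAdamAttrs {
      double alpha;   // base learning rate
      double alpha_t; // bias-corrected learning rate for the current step
      double beta1_t; // running product of beta1 across iterations
      double beta2_t; // running product of beta2 across iterations
    };

    ToyAdamAttrs next_iter(ToyAdamAttrs const &a, double beta1, double beta2) {
      double beta1_t = a.beta1_t * beta1;
      double beta2_t = a.beta2_t * beta2;
      double alpha_t = a.alpha * std::sqrt(1 - beta2_t) / (1 - beta1_t);
      return ToyAdamAttrs{a.alpha, alpha_t, beta1_t, beta2_t};
    }
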
b/lib/realm-backend/src/realm_task_argument_accessor.cc @@ -57,37 +57,6 @@ Allocator RealmTaskArgumentAccessor::get_allocator() const { return this->allocator; } -TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &slots_backing) { - - TensorSlotsBackingWithoutAddresses addressless_slots_backing; - - using TensorAccessorVariant = - std::variant>; - for (auto const &slot_tensor : slots_backing) { - TensorAccessorVariant accessor_variant = slot_tensor.second; - std::visit( - overload{ - [&](GenericTensorAccessorW const &accessor) { - addressless_slots_backing.insert( - {slot_tensor.first, get_shape_and_datatype(accessor)}); - }, - [&](std::vector const &variadic_accessor) { - std::vector> - variadic_addressless_accessor = - transform(variadic_accessor, - [](GenericTensorAccessorW const &accessor) { - return get_shape_and_datatype(accessor); - }); - addressless_slots_backing.insert( - {slot_tensor.first, variadic_addressless_accessor}); - }}, - accessor_variant); - } - return addressless_slots_backing; -} - size_t RealmTaskArgumentAccessor::get_device_idx() const { return 0; } diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc index 12d0973fba..5dcfa8cef8 100644 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ b/lib/realm-backend/src/realm_tensor_backing.cc @@ -1,9 +1,6 @@ #include "op-attrs/parallel_tensor_shape.h" -#include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" -#include "local-execution/allocated_tensors.h" -#include "realm-backend/realm_allocator.h" #include "realm-backend/realm_tensor_backing.h" #include "task-spec/slot_grad_id.dtg.h" #include "utils/containers/contains_key.h" @@ -12,190 +9,72 @@ namespace FlexFlow { -RealmTensorBacking::RealmTensorBacking( - AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator const &allocator) - : tensor_gradient_mapping(allocated_tensors.gradient_mapping), - tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), - allocator(allocator) { - - // handle already-allocated tensors - for (std::pair const - &tensor_type_backing : allocated_tensors.tensor_type_backings) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(tensor_type_backing.first); - this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); - } - - // allocate new tensors - this->tensor_gradient_mapping.insert( - unallocated_tensors.gradient_mapping.begin(), - unallocated_tensors.gradient_mapping.end()); +GenericTensorAccessorW +get_tensor(RealmTensorBacking const &realm_tensor_backing, + TensorTypeVariant const &tensor_type) { + return realm_tensor_backing.tensor_backings.at(tensor_type); +} +std::unordered_map> +merge_optimizer_mappings( + std::unordered_map> const + &allocated, + std::unordered_map> const + &unallocated) { + std::unordered_map> + merged_maps = allocated; for (std::pair> const - &unallocated_optimizer_tensors : - unallocated_tensors.optimizer_mapping) { - if (this->tensor_optimizer_mapping.count( - unallocated_optimizer_tensors.first)) { + &unallocated_optimizer_tensors : unallocated) { + if (merged_maps.count(unallocated_optimizer_tensors.first)) { for (optimizer_tensor_t const &optimizer_tensor : unallocated_optimizer_tensors.second) { - this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first] - .push_back(optimizer_tensor); + merged_maps[unallocated_optimizer_tensors.first].push_back( + 
optimizer_tensor); } } else { - this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors}); + merged_maps.insert({unallocated_optimizer_tensors}); } } - - for (std::pair const &tensor_type_shape : - unallocated_tensors.tensor_type_shapes) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(tensor_type_shape.first); - GenericTensorAccessorW tensor_backing = - this->allocator.allocate_tensor(tensor_type_shape.second); - this->tensor_backings.insert({lowered_tensor, tensor_backing}); - } -}; - -lowered_tensor_t -RealmTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { - lowered_tensor_t lowered_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - tensor_type.visit(overload{ - [&](tensor_guid_t const &tensor_guid) { - this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); - return std::nullopt; - }, - [&](gradient_tensor_t const &gradient_tensor) { - this->gradient_tensor_lowering_mapping.insert( - {gradient_tensor, lowered_tensor}); - return std::nullopt; - }, - [&](optimizer_tensor_t const &optimizer_tensor) { - this->optimizer_tensor_lowering_mapping.insert( - {optimizer_tensor, lowered_tensor}); - return std::nullopt; - }, - [&](loss_tensor_t const &loss_tensor) { - this->loss_tensor_lowering_mapping.insert( - {loss_tensor, lowered_tensor}); - return std::nullopt; - }, - [&](auto const &any_tensor) { - throw mk_runtime_error( - fmt::format("Unhandled tensor type {}", any_tensor)); - }}); - return lowered_tensor; -} - -GenericTensorAccessorW -RealmTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { - lowered_tensor_t lowered_tensor = - tensor_type.visit(overload{ - [&](tensor_guid_t const &tensor_guid) { - return this->tensor_lowering_mapping.at(tensor_guid); - }, - [&](gradient_tensor_t const &gradient_tensor) { - return this->gradient_tensor_lowering_mapping.at(gradient_tensor); - }, - [&](optimizer_tensor_t const &optimizer_tensor) { - return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); - }, - [&](loss_tensor_t const &loss_tensor) { - return this->loss_tensor_lowering_mapping.at(loss_tensor); - }, - [&](auto const &any_tensor) { - throw mk_runtime_error( - fmt::format("Unhandled tensor type {}", any_tensor)); - }}); - return this->tensor_backings.at(lowered_tensor); + return merged_maps; } -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source) { - - assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); - - std::unordered_map tensor_type_shapes; - std::unordered_map gradient_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; - if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { - tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); - } +std::unordered_map +get_tensor_backings( + std::unordered_map const + &tensor_type_backings, + std::unordered_map const + &tensor_type_shapes, + Allocator &allocator) { + std::unordered_map + all_tensor_backings = tensor_type_backings; - if (tensor_attrs.create_gradients == CreateGrad::YES && - !allocated_tensors.gradient_mapping.count(tensor_guid)) { - gradient_tensor_t gradient_tensor = - gradient_tensor_source.new_gradient_tensor(); - 
tensor_type_shapes.insert( - {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); - gradient_mapping.insert({tensor_guid, gradient_tensor}); - } + // allocate new tensors + for (std::pair const &tensor_type_shape : + tensor_type_shapes) { + GenericTensorAccessorW tensor_backing = + allocator.allocate_tensor(tensor_type_shape.second); + all_tensor_backings.insert({tensor_type_shape.first, tensor_backing}); } - return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}}; + return all_tensor_backings; } -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source, - OptimizerTensorSource &optimizer_tensor_source, - OptimizerAttrs const &optimizer_attrs) { - - UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( - allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); - - if (!get_num_optimizer_tensors(optimizer_attrs)) { - return unallocated_tensors; - } - - std::unordered_map tensor_type_shapes = - unallocated_tensors.tensor_type_shapes; - std::unordered_map gradient_mapping = - unallocated_tensors.gradient_mapping; - std::unordered_map> - optimizer_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_gradients == CreateGrad::YES) { - std::vector optimizer_tensors; - - int num_optimizer_tensors_to_allocate = - get_num_optimizer_tensors(optimizer_attrs); - if (allocated_tensors.optimizer_mapping.count(tensor_guid)) { - num_optimizer_tensors_to_allocate -= - allocated_tensors.optimizer_mapping.at(tensor_guid).size(); - } - std::cout << num_optimizer_tensors_to_allocate; - - for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { - optimizer_tensor_t optimizer_tensor = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensors.push_back(optimizer_tensor); - tensor_type_shapes.insert( - {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); - } - - if (num_optimizer_tensors_to_allocate > 0) { - optimizer_mapping.insert({tensor_guid, optimizer_tensors}); - } - } - } - - return UnallocatedTensors{tensor_type_shapes, gradient_mapping, - optimizer_mapping}; +RealmTensorBacking +construct_realm_tensor_backing(AllocatedTensors const &allocated_tensors, + UnallocatedTensors const &unallocated_tensors, + Allocator &allocator) { + + std::unordered_map merged_gradient_maps = + allocated_tensors.gradient_mapping; + merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(), + unallocated_tensors.gradient_mapping.end()); + + return RealmTensorBacking{ + get_tensor_backings(allocated_tensors.tensor_type_backings, + unallocated_tensors.tensor_type_shapes, allocator), + merged_gradient_maps, + merge_optimizer_mappings(allocated_tensors.optimizer_mapping, + unallocated_tensors.optimizer_mapping)}; } TensorSlotsBacking @@ -206,10 +85,10 @@ construct_tensor_slots_backing(RealmTensorBacking const &realm_tensor_backing, for (std::pair const &tensor_binding : binding.get_tensor_bindings()) { mapping.insert({tensor_binding.first, - realm_tensor_backing.get_tensor(tensor_binding.second)}); + get_tensor(realm_tensor_backing, tensor_binding.second)}); } return mapping; } -} // namespace FlexFlow +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/realm-backend/src/realm_training_backing.cc 
b/lib/realm-backend/src/realm_training_backing.cc index d0b985921e..9da921d097 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -1,3 +1,4 @@ +#include "kernels/allocation.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" #include "local-execution/task_signature_impl.h" @@ -32,12 +33,12 @@ RealmTrainingBacking::RealmTrainingBacking( allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( get_layer_attrs_mapping(this->computation_graph))), - realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu - allocated_tensors, - generate_unallocated_tensors( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), - this->allocators[0])), + realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source), + this->allocators[0])), realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -78,13 +79,13 @@ RealmTrainingBacking::RealmTrainingBacking( allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( get_layer_attrs_mapping(this->computation_graph))), - realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu - allocated_tensors, - generate_unallocated_tensors_with_optimizer( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, this->optimizer_tensor_source, - optimizer_attrs), - this->allocators[0])), + realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source, this->optimizer_tensor_source, + optimizer_attrs), + this->allocators[0])), realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -127,6 +128,8 @@ initialize_args_backing(RealmTrainingBacking *backing, Memory master_mem = backing->master_mem; std::vector &worker_procs = backing->worker_procs; std::vector &worker_events = backing->worker_events; + // TODO: multi gpu + Allocator &allocator = backing->allocators[0]; for (layer_guid_t const &node : topological_ordering(cg)) { if (registry_contains_task_for_layer(task_registry, node, @@ -141,7 +144,8 @@ initialize_args_backing(RealmTrainingBacking *backing, TaskArgumentAccessor accessor = get_task_arg_accessor( realm_tensor_backing, make_args_backing_with_empty_device_states(runtime_arg_config), - invocation); + invocation, + allocator); task_id_t task_id = invocation.task_id; TaskImplFunction impl_function = task_registry.task_mapping.at(task_id).impl_function; @@ -187,7 +191,8 @@ execute_forward(RealmTrainingBacking &realm_training_backing, device_state); TaskArgumentAccessor accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, invocation); + realm_training_backing.realm_args_backing, invocation, + realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; TaskImplFunction impl_function = realm_training_backing.task_registry.task_mapping.at(task_id) @@ 
-233,7 +238,8 @@ execute_backward(RealmTrainingBacking &realm_training_backing, device_state); TaskArgumentAccessor accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, invocation); + realm_training_backing.realm_args_backing, invocation, + realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; TaskImplFunction impl_function = realm_training_backing.task_registry.task_mapping.at(task_id) @@ -282,7 +288,8 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, // execute update TaskArgumentAccessor accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, invocation); + realm_training_backing.realm_args_backing, invocation, + realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], task_id); @@ -316,7 +323,8 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, loss_invocation); + realm_training_backing.realm_args_backing, loss_invocation, + realm_training_backing.allocators[0]); task_id_t task_id = loss_invocation.task_id; register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], task_id); @@ -337,14 +345,15 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &realm_tensor_backing, RealmArgsBacking const &realm_args_backing, - TaskInvocation const &invocation) { + TaskInvocation const &invocation, + Allocator &allocator) { TensorSlotsBacking tensor_slots_backing = construct_tensor_slots_backing(realm_tensor_backing, invocation.binding); ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( invocation.binding, realm_args_backing.runtime_arg_config); // TODO: multi gpu return TaskArgumentAccessor::create( - realm_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); + allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow From 7c53bb31a9f969d0ed72cc2bfbe3d9005be045c9 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 5 Mar 2025 01:07:15 -0800 Subject: [PATCH 57/91] chore: minor --- .../realm-backend/realm_training_backing.h | 7 +++-- .../src/realm_training_backing.cc | 31 ++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index ee426324cb..c695dc1a46 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -24,12 +24,15 @@ struct RealmTrainingBacking { std::vector const &, std::vector const &, AllocatedTensors const &, + GradientTensorSource &, ComputationGraph const &, RuntimeArgConfig const &); RealmTrainingBacking(Realm::Processor, std::vector const &, std::vector const &, AllocatedTensors const &, + GradientTensorSource &, + OptimizerTensorSource &, ComputationGraph const &, RuntimeArgConfig const &, OptimizerAttrs const &); @@ -47,12 +50,10 @@ struct RealmTrainingBacking { ComputationGraph computation_graph; TaskRegistry task_registry; - - GradientTensorSource gradient_tensor_source; - 
OptimizerTensorSource optimizer_tensor_source; }; RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, + ComputationGraph const &, RuntimeArgConfig const &); void execute_init(RealmTrainingBacking &, layer_guid_t const &); diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 4c50548fa9..f03f788345 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -27,19 +27,20 @@ RealmTrainingBacking::RealmTrainingBacking( Processor master_proc, std::vector const &worker_procs, std::vector const &allocators, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) : master_proc(master_proc), worker_procs(worker_procs), allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + get_layer_attrs_mapping(computation_graph))), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), + allocated_tensors, get_all_tensor_attrs(computation_graph), + gradient_tensor_source), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) @@ -58,7 +59,7 @@ RealmTrainingBacking::RealmTrainingBacking( get_layer_attrs_mapping(this->computation_graph); for (std::pair const &layer_attrs : layer_attrs_mapping) { - ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; std::vector task_ids = get_task_ids(attrs); for (task_id_t task_id : task_ids) { TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); @@ -72,21 +73,23 @@ RealmTrainingBacking::RealmTrainingBacking( Processor master_proc, std::vector const &worker_procs, std::vector const &allocators, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config, OptimizerAttrs const &optimizer_attrs) : master_proc(master_proc), worker_procs(worker_procs), allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + get_layer_attrs_mapping(computation_graph))), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors_with_optimizer( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, this->optimizer_tensor_source, + allocated_tensors, get_all_tensor_attrs(computation_graph), + gradient_tensor_source, optimizer_tensor_source, optimizer_attrs), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) @@ 
-101,7 +104,7 @@ RealmTrainingBacking::RealmTrainingBacking( get_layer_attrs_mapping(this->computation_graph); for (std::pair const &layer_attrs : layer_attrs_mapping) { - ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; std::vector task_ids = get_task_ids(attrs); for (task_id_t task_id : task_ids) { TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); @@ -113,6 +116,7 @@ RealmTrainingBacking::RealmTrainingBacking( RealmArgsBacking initialize_args_backing(RealmTrainingBacking *backing, + ComputationGraph const &cg, RuntimeArgConfig const &runtime_arg_config) { // initialize_args_backing(TaskRegistry const &task_registry, // ComputationGraph const &cg, @@ -121,7 +125,6 @@ initialize_args_backing(RealmTrainingBacking *backing, std::unordered_map per_device_op_states; TaskRegistry const &task_registry = backing->task_registry; - ComputationGraph const &cg = backing->computation_graph; RealmTensorBacking const &realm_tensor_backing = backing->realm_tensor_backing; Processor master_proc = backing->master_proc; @@ -134,7 +137,7 @@ initialize_args_backing(RealmTrainingBacking *backing, for (layer_guid_t const &node : topological_ordering(cg)) { if (registry_contains_task_for_layer(task_registry, node, OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs; TaskInvocation invocation = lower_to_task_invocation( init(attrs), node, get_incoming_inputs(cg, node), @@ -173,7 +176,7 @@ execute_forward(RealmTrainingBacking &realm_training_backing, operator_node, OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(realm_training_backing.computation_graph, operator_node) - .attrs; + .op_attrs; std::optional device_state = get_per_device_op_state_if_exists( realm_training_backing.realm_args_backing, operator_node); @@ -220,7 +223,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, operator_node, OpTaskType::BWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(realm_training_backing.computation_graph, operator_node) - .attrs; + .op_attrs; std::optional device_state = get_per_device_op_state_if_exists( realm_training_backing.realm_args_backing, operator_node); From bf57d1dfe2e6f4fbabeb2170f386bd399a080c7c Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 5 Mar 2025 01:17:51 -0800 Subject: [PATCH 58/91] chore: remove deprecated file --- lib/realm-backend/src/allocated_tensors.cc | 141 --------------------- 1 file changed, 141 deletions(-) delete mode 100644 lib/realm-backend/src/allocated_tensors.cc diff --git a/lib/realm-backend/src/allocated_tensors.cc b/lib/realm-backend/src/allocated_tensors.cc deleted file mode 100644 index 3e249bf6d1..0000000000 --- a/lib/realm-backend/src/allocated_tensors.cc +++ /dev/null @@ -1,141 +0,0 @@ -#include "local-execution/allocated_tensors.h" -#include "pcg/optimizer_attrs.h" -#include "utils/containers/keys.h" -#include "utils/containers/set_union.h" - -namespace FlexFlow { - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &tensor_type, - std::unordered_map const - &allocated_tensor_backings, - ArrayShape const &expected_shape) { - if (allocated_tensor_backings.count(tensor_type)) { - GenericTensorAccessorW tensor_backing = - allocated_tensor_backings.at(tensor_type); - if (expected_shape == tensor_backing.shape) { - return true; - } - } - return false; -}; - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const 
&allocated_tensors, - std::unordered_map const &tensor_attrs) { - - std::unordered_set all_tensor_guids = transform( - keys(filter_keys( - allocated_tensors.tensor_type_backings, - [&](TensorTypeVariant const &k) { return k.has(); })), - [&](TensorTypeVariant const &t) { return t.get(); }); - - for (tensor_guid_t const &tensor_guid : all_tensor_guids) { - if (tensor_attrs.count(tensor_guid)) { - if (!is_allocated_tensor_backing_valid( - TensorTypeVariant{tensor_guid}, - allocated_tensors.tensor_type_backings, - ArrayShape{tensor_attrs.at(tensor_guid).shape})) { - return false; - } - } else { - return false; - } - } - return true; -} - -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling gradient tensors - - for (std::pair const &tensor_to_grad : - allocated_tensors.gradient_mapping) { - if (tensor_attrs.count(tensor_to_grad.first)) { - if (tensor_attrs.at(tensor_to_grad.first).create_gradients == - CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; - TensorTypeVariant gradient_tensor = - TensorTypeVariant{tensor_to_grad.second}; - if (is_allocated_tensor_backing_valid( - gradient_tensor, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(gradient_tensor); - } else { - return false; - } - } else { - return false; - } - } - - for (TensorTypeVariant const &tensor_type : - keys(allocated_tensors.tensor_type_backings)) { - if (tensor_type.has()) { - if (!tensors_in_mappings.count(tensor_type)) { - return false; - } - } - } - return true; -} - -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling optimizer tensors - - for (std::pair> const - &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { - if (tensor_attrs.count(tensor_to_optimizers.first)) { - if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == - CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; - for (optimizer_tensor_t const &optimizer_tensor : - tensor_to_optimizers.second) { - if (is_allocated_tensor_backing_valid( - TensorTypeVariant{optimizer_tensor}, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); - } else { - return false; - } - } - } - } - - for (TensorTypeVariant const &tensor_type : - keys(allocated_tensors.tensor_type_backings)) { - if (tensor_type.has()) { - if (!tensors_in_mappings.count(tensor_type)) { - return false; - } - } - } - - return true; -} - -bool are_allocated_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) && - are_allocated_gradient_tensors_valid(allocated_tensors, - tensor_attrs) && - are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); -} - -} // namespace FlexFlow From 3a0d4e85b2937de8ab8a97bfae688c4f8a0808ea Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 12 Mar 2025 12:02:49 -0700 Subject: [PATCH 59/91] feat: add a unit test for realm backend --- .proj.toml | 2 +- lib/realm-backend/CMakeLists.txt | 2 +- 
.../src/realm_training_backing.cc | 4 - lib/realm-backend/test/CMakeLists.txt | 8 +- .../test/modify_test_commands.cmake | 21 +++ lib/realm-backend/test/src/test_update.cc | 120 ++++++++++++++++++ lib/realm-backend/test/src/test_utils.cc | 19 +++ lib/realm-backend/test/src/test_utils.h | 23 ++++ 8 files changed, 192 insertions(+), 7 deletions(-) create mode 100644 lib/realm-backend/test/modify_test_commands.cmake create mode 100644 lib/realm-backend/test/src/test_update.cc create mode 100644 lib/realm-backend/test/src/test_utils.cc create mode 100644 lib/realm-backend/test/src/test_utils.h diff --git a/.proj.toml b/.proj.toml index 66caad7e4c..3581b3b2c7 100644 --- a/.proj.toml +++ b/.proj.toml @@ -28,7 +28,7 @@ test_targets = [ "compiler-tests", "substitution-generator-tests", "local-execution-tests", - #"realm-backend-tests", + "realm-backend-tests", "models-tests", ] diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt index 436d8cc8b0..623816567e 100644 --- a/lib/realm-backend/CMakeLists.txt +++ b/lib/realm-backend/CMakeLists.txt @@ -17,4 +17,4 @@ ff_add_library( legion ) -# add_subdirectory(test) +add_subdirectory(test) diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index f03f788345..17463ec4ec 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -49,10 +49,6 @@ RealmTrainingBacking::RealmTrainingBacking( for (Processor p : worker_procs) { worker_events.push_back(Realm::Event::NO_EVENT); } - // Machine::ProcessorQuery pq = - // Machine::ProcessorQuery(Machine::get_machine()) - // .only_kind(Processor::TOC_PROC); - // allocators.push_back(create_realm_memory_allocator(p)); // register tasks for realm std::unordered_map const &layer_attrs_mapping = diff --git a/lib/realm-backend/test/CMakeLists.txt b/lib/realm-backend/test/CMakeLists.txt index 965f2e04b2..e180208fbc 100644 --- a/lib/realm-backend/test/CMakeLists.txt +++ b/lib/realm-backend/test/CMakeLists.txt @@ -6,9 +6,15 @@ ff_add_test_executable( PRIVATE_INCLUDE src/ DEPS - doctest utils-test-common realm-backend kernels op-attrs ) + +set(FF_TEST_EXEC_NAME "realm-backend-tests") +add_custom_command( + TARGET ${FF_TEST_EXEC_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake + DEPENDS ${FF_TEST_EXEC_NAME} +) diff --git a/lib/realm-backend/test/modify_test_commands.cmake b/lib/realm-backend/test/modify_test_commands.cmake new file mode 100644 index 0000000000..6494ae2d78 --- /dev/null +++ b/lib/realm-backend/test/modify_test_commands.cmake @@ -0,0 +1,21 @@ +# modify_test_commands.cmake + +file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") + +foreach(ctest_tests_file IN LISTS ctest_tests_files) + file(READ "${ctest_tests_file}" content) + + # add nix run prefix + string(REGEX REPLACE + "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" + "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" + content "${content}") + + # add environment + # string(REGEX REPLACE + # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" + # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" + # content "${content}") + + file(WRITE "${ctest_tests_file}" "${content}") +endforeach() diff --git a/lib/realm-backend/test/src/test_update.cc 
b/lib/realm-backend/test/src/test_update.cc new file mode 100644 index 0000000000..1023399c8a --- /dev/null +++ b/lib/realm-backend/test/src/test_update.cc @@ -0,0 +1,120 @@ +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-backend/driver.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_training_backing.h" +#include "test_utils.h" + +using namespace ::FlexFlow; +using namespace Realm; + +void top_level_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p) { + // initialize runtime configs + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + std::vector worker_procs; + std::vector allocators; + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + for (Processor p : pq) { + worker_procs.push_back(p); + allocators.push_back(create_realm_memory_allocator(p)); + } + + AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape input_tensor_shape = + TensorShape{TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = + TensorShape{TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, + "weights"}, + {}, {}); + + LayerAddedResult linear_operator = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{ + LinearAttrs{output_dim, + /*use_bias=*/false, DataType::FLOAT, + Activation::RELU, std::nullopt}}, + "linear"}, + inputs_layer.outputs, weights_layer.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + int test_id = 0; + + { + printf("Running test %d: SGDOptimizerAttrs, momentum=0\n", ++test_id); + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs); + } + + { + printf("Running test %d: SGDOptimizerAttrs, momentum=0.9\n", ++test_id); + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, 
runtime_arg_config, + optimizer_attrs); + execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs); + } + + { + printf("Running test %d: AdamOptimizerAttrs\n", ++test_id); + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs); + } +} diff --git a/lib/realm-backend/test/src/test_utils.cc b/lib/realm-backend/test/src/test_utils.cc new file mode 100644 index 0000000000..b7a4e16b97 --- /dev/null +++ b/lib/realm-backend/test/src/test_utils.cc @@ -0,0 +1,19 @@ +#include "test_utils.h" +#include "pcg/tensor_guid_t.dtg.h" + +namespace FlexFlow { + +PerDeviceFFHandle get_mock_per_device_ff_handle() { + return {nullptr, nullptr, nullptr, 0, false}; +} + +size_t MockTensorGuidSource::next_available_mock_tensor_guid = 0; + +MockTensorGuidSource::MockTensorGuidSource() {} + +tensor_guid_t MockTensorGuidSource::new_mock_tensor_guid() { + size_t next_guid = MockTensorGuidSource::next_available_mock_tensor_guid++; + return tensor_guid_t{DataflowOutput{Node{0}, nonnegative_int{next_guid}}}; +} + +} // namespace FlexFlow diff --git a/lib/realm-backend/test/src/test_utils.h b/lib/realm-backend/test/src/test_utils.h new file mode 100644 index 0000000000..056e92687c --- /dev/null +++ b/lib/realm-backend/test/src/test_utils.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS +#define _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS + +#include "kernels/ff_handle.h" +#include "pcg/tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct MockTensorGuidSource { +public: + MockTensorGuidSource(); + + tensor_guid_t new_mock_tensor_guid(); + +private: + static size_t next_available_mock_tensor_guid; +}; + +PerDeviceFFHandle get_mock_per_device_ff_handle(); + +} // namespace FlexFlow + +#endif From fa3f9173b1148d3a0ba5b163e1c405f6e3bc7f59 Mon Sep 17 00:00:00 2001 From: fruitea Date: Sun, 16 Mar 2025 09:59:53 -0700 Subject: [PATCH 60/91] fix: DeviceSpecificState error --- .../include/realm-backend/task_result.h | 28 +++++++++++---- lib/realm-backend/src/driver.cc | 1 + .../src/realm_training_backing.cc | 8 ++--- lib/realm-backend/src/task_result.cc | 35 ------------------- lib/realm-backend/src/task_wrapper.cc | 7 ++-- 5 files changed, 30 insertions(+), 49 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index bac20ddd14..cebaf8ccb6 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -19,11 +19,25 @@ template struct SharedState { Realm::RegionInstance inst; SharedState() = delete; - SharedState(Realm::Memory); - void set_event(Realm::Event); - void set_value(T &&); - void wait(); - T get_value(); + SharedState(Realm::Memory mem) { + Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0)); + this->inst = Realm::RegionInstance::NO_INST; + Realm::RegionInstance::create_instance( + this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1, + Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT) + .wait(); + } + void set_event(Realm::Event e) { this->event = e; } + 
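+  // Hypothetical usage sketch (illustrative, not part of this patch): the
+  // launching side holds the state and the spawned task fills it in:
+  //   SharedState<int> state(master_mem);  // master_mem: a SYSTEM_MEM Memory
+  //   state.set_event(spawn_event);        // event returned by Processor::spawn
+  //   /* inside the spawned task: */ state.set_value(42);
+  //   int v = state.get_value();           // blocks on spawn_event, then reads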
+  void set_value(T &&value) {
+    Realm::GenericAccessor<T, 1> acc(this->inst, 0);
+    acc[Realm::Point<1>(0)] = std::move(value);
+  }
+  void wait() { this->event.wait(); }
+  T get_value() {
+    wait();
+    Realm::GenericAccessor<T, 1> acc(this->inst, 0);
+    return acc[Realm::Point<1>(0)];
+  }
 };
 
 // Specialization of SharedState for the `void` type, as it does not carry a
@@ -33,8 +47,8 @@ template <> struct SharedState<void> {
   Realm::Event event = Realm::Event::NO_EVENT;
 
   SharedState() = default;
-  void set_event(Realm::Event);
-  void wait();
+  void set_event(Realm::Event e) { this->event = e; }
+  void wait() { this->event.wait(); }
 };
 
 /**
diff --git a/lib/realm-backend/src/driver.cc b/lib/realm-backend/src/driver.cc
index 8cfb038d97..3f02bf7098 100644
--- a/lib/realm-backend/src/driver.cc
+++ b/lib/realm-backend/src/driver.cc
@@ -18,6 +18,7 @@ int main(int argc, const char **argv) {
   Processor p = Machine::ProcessorQuery(Machine::get_machine())
                     .only_kind(Processor::LOC_PROC)
                     .first();
+  assert(p.exists());
   rt.shutdown(rt.collective_spawn(p, static_cast<Processor::TaskFuncID>(task_id_t::TOP_LEVEL_TASK_ID), 0, 0));
   return rt.wait_for_shutdown();
diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index 17463ec4ec..7879d63231 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -149,16 +149,16 @@ initialize_args_backing(RealmTrainingBacking *backing,
       TaskImplFunction impl_function =
           task_registry.task_mapping.at(task_id).impl_function;
       // TODO: multi gpu launching
-      Promise<DeviceSpecificDeviceStates> promise(master_mem);
-      Future<DeviceSpecificDeviceStates> future = promise.get_future();
-      RealmTaskArgs<DeviceSpecificDeviceStates> args{
+      Promise<std::optional<DeviceSpecificDeviceStates>> promise(master_mem);
+      Future<std::optional<DeviceSpecificDeviceStates>> future = promise.get_future();
+      RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> args{
           task_id, impl_function, accessor, std::move(promise)};
       Event e =
           worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
                                 &args, sizeof(args), worker_events[0]);
       worker_events[0] = e;
       future.set_event(e);
-      per_device_op_states.insert({node, std::move(future.get())});
+      per_device_op_states.insert({node, std::move(future.get().value())});
     }
   }
 
diff --git a/lib/realm-backend/src/task_result.cc b/lib/realm-backend/src/task_result.cc
index 05aa1a8a9c..e69de29bb2 100644
--- a/lib/realm-backend/src/task_result.cc
+++ b/lib/realm-backend/src/task_result.cc
@@ -1,35 +0,0 @@
-#include "realm-backend/task_result.h"
-
-namespace FlexFlow {
-
-/************ SharedState implementation ************/
-template <typename T> SharedState<T>::SharedState(Realm::Memory mem) {
-  Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0));
-  this->inst = Realm::RegionInstance::NO_INST;
-  Realm::RegionInstance::create_instance(
-      this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1,
-      Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT)
-      .wait();
-}
-
-template <typename T> void SharedState<T>::set_event(Realm::Event e) {
-  this->event = e;
-}
-
-template <typename T> void SharedState<T>::set_value(T &&value) {
-  Realm::GenericAccessor<T, 1> acc(this->inst, 0);
-  acc[Realm::Point<1>(0)] = std::move(value);
-}
-
-template <typename T> void SharedState<T>::wait() { this->event.wait(); }
-
-template <typename T> T SharedState<T>::get_value() {
-  wait();
-  Realm::GenericAccessor<T, 1> acc(this->inst, 0);
-  return acc[Realm::Point<1>(0)];
-}
-
-void SharedState<void>::set_event(Realm::Event e) { this->event = e; }
-
-void SharedState<void>::wait() { this->event.wait(); }
-} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc
index ea36275462..ca5ff4f4fd 100644
--- a/lib/realm-backend/src/task_wrapper.cc
+++ 
b/lib/realm-backend/src/task_wrapper.cc @@ -1,4 +1,5 @@ #include "realm-backend/task_wrapper.h" +#include namespace FlexFlow { @@ -6,12 +7,12 @@ using namespace Realm; void init_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + RealmTaskArgs> const &task_args = + *reinterpret_cast> *>(args); auto fn = task_args.impl_function.get().function_ptr; DeviceSpecificDeviceStates result = fn(task_args.accessor); - task_args.promise.set_value(std::move(result)); + task_args.promise.set_value(std::make_optional(result)); } void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, From b55aed70135e9e9e9422f2ac6736e267f404e9da Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 19 Mar 2025 01:58:40 -0700 Subject: [PATCH 61/91] fix: realm task id should start from `Processor::TASK_ID_FIRST_AVAILABLE` --- .proj.toml | 5 +++++ .../include/realm-backend/driver.h | 2 ++ lib/realm-backend/src/driver.cc | 22 ++++++++++++------- .../src/realm_training_backing.cc | 10 ++++----- lib/realm-backend/src/task_wrapper.cc | 6 ++--- lib/realm-backend/test/CMakeLists.txt | 5 +---- 6 files changed, 30 insertions(+), 20 deletions(-) diff --git a/.proj.toml b/.proj.toml index a06fb53c3a..6b2909ef2a 100644 --- a/.proj.toml +++ b/.proj.toml @@ -43,6 +43,11 @@ type = "lib" tests = true benchmarks = false +[targets.realm-backend] +type = "lib" +tests = false +benchmarks = false + [targets.models] type = "lib" tests = true diff --git a/lib/realm-backend/include/realm-backend/driver.h b/lib/realm-backend/include/realm-backend/driver.h index 884b97a23d..d4b373099b 100644 --- a/lib/realm-backend/include/realm-backend/driver.h +++ b/lib/realm-backend/include/realm-backend/driver.h @@ -5,6 +5,8 @@ #include "realm/cmdline.h" #include "task-spec/op_task_invocation.h" +Realm::Processor::TaskFuncID get_realm_task_id(FlexFlow::task_id_t task_id); + void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p); diff --git a/lib/realm-backend/src/driver.cc b/lib/realm-backend/src/driver.cc index 3f02bf7098..e656836c10 100644 --- a/lib/realm-backend/src/driver.cc +++ b/lib/realm-backend/src/driver.cc @@ -3,16 +3,19 @@ using namespace Realm; using namespace FlexFlow; -Logger log_app("app"); +Processor::TaskFuncID get_realm_task_id(task_id_t task_id) { + return static_cast(task_id) + + Processor::TASK_ID_FIRST_AVAILABLE; +} -int main(int argc, const char **argv) { +int main(int argc, char **argv) { Runtime rt; - rt.init(&argc, (char ***)&argv); + rt.init(&argc, &argv); - Processor::register_task_by_kind(Processor::LOC_PROC, false /*!global*/, - static_cast(task_id_t::TOP_LEVEL_TASK_ID), - CodeDescriptor(top_level_task), - ProfilingRequestSet()) + Processor::register_task_by_kind( + Processor::LOC_PROC, false /*!global*/, + get_realm_task_id(task_id_t::TOP_LEVEL_TASK_ID), + CodeDescriptor(top_level_task), ProfilingRequestSet()) .external_wait(); Processor p = Machine::ProcessorQuery(Machine::get_machine()) @@ -20,6 +23,9 @@ int main(int argc, const char **argv) { .first(); assert(p.exists()); - rt.shutdown(rt.collective_spawn(p, static_cast(task_id_t::TOP_LEVEL_TASK_ID), 0, 0)); + Event e = rt.collective_spawn( + p, get_realm_task_id(task_id_t::TOP_LEVEL_TASK_ID), 0, 0); + rt.shutdown(e); + return rt.wait_for_shutdown(); } diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 7879d63231..d1a25b2788 
100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -154,7 +154,7 @@ initialize_args_backing(RealmTrainingBacking *backing, RealmTaskArgs> args{ task_id, impl_function, accessor, std::move(promise)}; Event e = - worker_procs[0].spawn(static_cast(task_id), + worker_procs[0].spawn(get_realm_task_id(task_id), &args, sizeof(args), worker_events[0]); worker_events[0] = e; future.set_event(e); @@ -202,7 +202,7 @@ execute_forward(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, impl_function, accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -249,7 +249,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, impl_function, accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -299,7 +299,7 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, update_impl_fn, accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -334,7 +334,7 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, loss_impl_fn, loss_accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index ca5ff4f4fd..e628f40ad8 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -36,21 +36,21 @@ void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, void register_wrapper_tasks_init(Processor p, task_id_t task_id) { Processor::register_task_by_kind( - p.kind(), false /*!global*/, static_cast(task_id), + p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(init_wrapper_task), ProfilingRequestSet()) .external_wait(); } void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) { Processor::register_task_by_kind( - p.kind(), false /*!global*/, static_cast(task_id), + p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(fwdbwd_wrapper_task), ProfilingRequestSet()) .external_wait(); } void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) { Processor::register_task_by_kind( - p.kind(), false /*!global*/, static_cast(task_id), + p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(generic_wrapper_task), ProfilingRequestSet()) .external_wait(); } diff --git a/lib/realm-backend/test/CMakeLists.txt b/lib/realm-backend/test/CMakeLists.txt index e180208fbc..6658784d9e 100644 --- a/lib/realm-backend/test/CMakeLists.txt +++ 
b/lib/realm-backend/test/CMakeLists.txt @@ -1,4 +1,4 @@ -ff_add_test_executable( +ff_add_executable( NAME realm-backend-tests SRC_PATTERNS @@ -6,10 +6,7 @@ ff_add_test_executable( PRIVATE_INCLUDE src/ DEPS - utils-test-common realm-backend - kernels - op-attrs ) set(FF_TEST_EXEC_NAME "realm-backend-tests") From a921775f8b3916b103829adee9e9390ba7f74452 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 19 Mar 2025 05:24:46 -0700 Subject: [PATCH 62/91] fix: RealmTrainingBacking initialization --- .../realm-backend/realm_training_backing.h | 9 ++- .../src/realm_training_backing.cc | 74 ++++++++----------- lib/realm-backend/src/task_result.cc | 0 3 files changed, 36 insertions(+), 47 deletions(-) delete mode 100644 lib/realm-backend/src/task_result.cc diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index c695dc1a46..8fe842daf6 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -45,13 +45,16 @@ struct RealmTrainingBacking { std::vector worker_events; std::vector allocators; - RealmTensorBacking realm_tensor_backing; - RealmArgsBacking realm_args_backing; - ComputationGraph computation_graph; TaskRegistry task_registry; + + RealmTensorBacking realm_tensor_backing; + RealmArgsBacking realm_args_backing; }; +TaskRegistry construct_task_registry_and_register_tasks_for_realm( + ComputationGraph const &, std::vector const &); + RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, ComputationGraph const &, RuntimeArgConfig const &); diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index d1a25b2788..ee46105b31 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -30,40 +30,24 @@ RealmTrainingBacking::RealmTrainingBacking( GradientTensorSource &gradient_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) - : master_proc(master_proc), worker_procs(worker_procs), + : master_proc(master_proc), master_event(Realm::Event::NO_EVENT), + master_mem(Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::SYSTEM_MEM) + .best_affinity_to(master_proc) + .first()), + worker_procs(worker_procs), + worker_events(std::vector(worker_procs.size(), + Realm::Event::NO_EVENT)), allocators(allocators), computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(computation_graph))), + task_registry(construct_task_registry_and_register_tasks_for_realm( + computation_graph, worker_procs)), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors( allocated_tensors, get_all_tensor_attrs(computation_graph), gradient_tensor_source), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { - master_event = Realm::Event::NO_EVENT; - master_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::SYSTEM_MEM) - .best_affinity_to(master_proc) - .first(); - for (Processor p : worker_procs) { - worker_events.push_back(Realm::Event::NO_EVENT); - } - - // register tasks for realm - std::unordered_map const &layer_attrs_mapping = - get_layer_attrs_mapping(this->computation_graph); - for (std::pair const &layer_attrs : - layer_attrs_mapping) { - ComputationGraphOpAttrs 
attrs = layer_attrs.second.op_attrs; - std::vector task_ids = get_task_ids(attrs); - for (task_id_t task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); - // TODO: multi gpu - register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); - } - } -} + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {} RealmTrainingBacking::RealmTrainingBacking( Processor master_proc, std::vector const &worker_procs, @@ -74,10 +58,17 @@ RealmTrainingBacking::RealmTrainingBacking( ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config, OptimizerAttrs const &optimizer_attrs) - : master_proc(master_proc), worker_procs(worker_procs), + : master_proc(master_proc), master_event(Realm::Event::NO_EVENT), + master_mem(Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::SYSTEM_MEM) + .best_affinity_to(master_proc) + .first()), + worker_procs(worker_procs), + worker_events(std::vector(worker_procs.size(), + Realm::Event::NO_EVENT)), allocators(allocators), computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(computation_graph))), + task_registry(construct_task_registry_and_register_tasks_for_realm( + computation_graph, worker_procs)), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors_with_optimizer( @@ -85,19 +76,16 @@ RealmTrainingBacking::RealmTrainingBacking( gradient_tensor_source, optimizer_tensor_source, optimizer_attrs), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { - master_event = Realm::Event::NO_EVENT; - master_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::SYSTEM_MEM) - .best_affinity_to(master_proc) - .first(); - for (Processor p : worker_procs) { - worker_events.push_back(Realm::Event::NO_EVENT); - } + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {} + +TaskRegistry construct_task_registry_and_register_tasks_for_realm( + ComputationGraph const &cg, std::vector const &worker_procs) { + TaskRegistry task_registry = construct_task_registry( + get_layer_attrs_mapping(cg)); // register tasks for realm std::unordered_map const &layer_attrs_mapping = - get_layer_attrs_mapping(this->computation_graph); + get_layer_attrs_mapping(cg); for (std::pair const &layer_attrs : layer_attrs_mapping) { ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; @@ -108,16 +96,14 @@ RealmTrainingBacking::RealmTrainingBacking( register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); } } + + return task_registry; } RealmArgsBacking initialize_args_backing(RealmTrainingBacking *backing, ComputationGraph const &cg, RuntimeArgConfig const &runtime_arg_config) { - // initialize_args_backing(TaskRegistry const &task_registry, - // ComputationGraph const &cg, - // RuntimeArgConfig const &runtime_arg_config, - // RealmTensorBacking const &realm_tensor_backing) { std::unordered_map per_device_op_states; TaskRegistry const &task_registry = backing->task_registry; diff --git a/lib/realm-backend/src/task_result.cc b/lib/realm-backend/src/task_result.cc deleted file mode 100644 index e69de29bb2..0000000000 From a708496e92216e53804e6aa9f82588996c779fc3 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 19 Mar 2025 10:31:30 -0700 Subject: [PATCH 63/91] fix: bugs with DeviceSpecificDeviceStates... 
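A likely culprit: the Promise/Future pair previously shared its state
through a std::shared_ptr, but RealmTaskArgs is byte-copied into Realm's
task-argument buffer, so the refcounted pointer is not reliable on the
receiving side. The diff below instead stores SharedState by value,
backed by a Realm RegionInstance. A rough sketch of the intended round
trip, assuming a single worker processor (payload type and variable
names are illustrative):

    Promise<float> promise(master_mem);
    Future<float> future = promise.get_future();
    RealmTaskArgs<float> args{task_id, impl_fn, accessor, std::move(promise)};
    Realm::Event e = worker_procs[0].spawn(get_realm_task_id(task_id),
                                           &args, sizeof(args), prev_event);
    future.set_event(e);          // the task itself calls promise.set_value(...)
    float result = future.get();  // waits on e, then reads the RegionInstance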
---
 .../include/realm-backend/task_result.h       | 51 ++++++++++---------
 .../include/realm-backend/task_wrapper.h      |  2 +-
 lib/realm-backend/src/realm_args_backing.cc   |  9 ----
 lib/realm-backend/src/task_wrapper.cc         | 14 ++++-
 4 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h
index bac20ddd14..19cd91f104 100644
--- a/lib/realm-backend/include/realm-backend/task_result.h
+++ b/lib/realm-backend/include/realm-backend/task_result.h
@@ -3,7 +3,7 @@
 
 #include "realm-backend/driver.h"
 #include
-#include
+#include
 
 namespace FlexFlow {
 
@@ -16,27 +16,30 @@ template <typename T> struct SharedState {
   // synchronization primitives
   Realm::Event event = Realm::Event::NO_EVENT;
   // where the result is stored
-  Realm::RegionInstance inst;
+  Realm::RegionInstance inst = Realm::RegionInstance::NO_INST;
 
-  SharedState() = delete;
+  SharedState() = default;
   SharedState(Realm::Memory mem) {
     Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0));
-    this->inst = Realm::RegionInstance::NO_INST;
     Realm::RegionInstance::create_instance(
         this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1,
         Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT)
         .wait();
   }
   void set_event(Realm::Event e) { this->event = e; }
-  void set_value(T &&value) {
+  void set_value(T &&value) const {
+    assert(this->inst.exists());
     Realm::GenericAccessor<T, 1> acc(this->inst, 0);
     acc[Realm::Point<1>(0)] = std::move(value);
   }
   void wait() { this->event.wait(); }
   T get_value() {
     wait();
+    assert(this->inst.exists());
     Realm::GenericAccessor<T, 1> acc(this->inst, 0);
-    return acc[Realm::Point<1>(0)];
+    T value = acc[Realm::Point<1>(0)];
+    this->inst.destroy();
+    return value;
   }
 };
 
@@ -59,34 +62,34 @@ template <> struct SharedState<void> {
  */
 template <typename T> class Future {
 public:
-  explicit Future(std::shared_ptr<SharedState<T>> state)
-      : state_(std::move(state)) {}
+  explicit Future(SharedState<T> state) : state_(state) {}
   explicit Future() = default;
   explicit Future(T value) : value_(std::move(value)) {}
-  void set_event(Realm::Event e) { state_->set_event(e); }
+  void set_event(Realm::Event e) { state_.set_event(e); }
   T get() {
-    value_ = std::make_optional(state_->get_value());
+    if (!value_.has_value()) {
+      value_ = std::make_optional(state_.get_value());
+    }
    return value_.value();
   }
-  void wait() { state_->wait(); }
+  void wait() { state_.wait(); }
 
 private:
-  std::shared_ptr<SharedState<T>> state_;
-  std::optional<T> value_ = std::nullopt;
+  SharedState<T> state_;
+  std::optional<T> value_;
 };
 
 // Specialization of Future for the `void` type, as it does not carry a value.
template <> class Future<void> {
 public:
-  explicit Future(std::shared_ptr<SharedState<void>> state)
-      : state_(std::move(state)) {}
+  explicit Future(SharedState<void> state) : state_(state) {}
   explicit Future() = default;
-  void set_event(Realm::Event e) { state_->set_event(e); }
-  void get() { state_->wait(); }
-  void wait() { state_->wait(); }
+  void set_event(Realm::Event e) { state_.set_event(e); }
+  void get() { state_.wait(); }
+  void wait() { state_.wait(); }
 
 private:
-  std::shared_ptr<SharedState<void>> state_;
+  SharedState<void> state_;
 };
 
 /**
@@ -97,22 +100,22 @@ template <> class Future<void> {
 template <typename T> class Promise {
 public:
   Promise() = delete;
-  Promise(Realm::Memory mem) : state_(std::make_shared<SharedState<T>>(mem)) {}
+  Promise(Realm::Memory mem) : state_(SharedState<T>(mem)) {}
   Future<T> get_future() { return Future<T>(state_); }
-  void set_value(T &&value) const { state_->set_value(std::move(value)); }
+  void set_value(T &&value) const { state_.set_value(std::move(value)); }
 
 private:
-  std::shared_ptr<SharedState<T>> state_;
+  SharedState<T> state_;
 };
 
 // Specialization of Promise for the `void` type, as it does not carry a value.
 template <> class Promise<void> {
 public:
-  Promise() : state_(std::make_shared<SharedState<void>>()) {}
+  Promise() : state_(SharedState<void>()) {}
   Future<void> get_future() { return Future<void>(state_); }
 
 private:
-  std::shared_ptr<SharedState<void>> state_;
+  SharedState<void> state_;
 };
 
 } // namespace FlexFlow
diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h
index 89521becf4..8265ca398b 100644
--- a/lib/realm-backend/include/realm-backend/task_wrapper.h
+++ b/lib/realm-backend/include/realm-backend/task_wrapper.h
@@ -31,7 +31,7 @@ void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id);
 
 void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id);
 
-void register_wrapper_tasks(Realm::Processor p, task_id_t task_id,
+void register_wrapper_tasks(int pid, Realm::Processor p, task_id_t task_id,
                             TaskSignatureAndImpl task_sig_impl);
 
 } // namespace FlexFlow
diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc
index e20fcdc14d..d30793a801 100644
--- a/lib/realm-backend/src/realm_args_backing.cc
+++ b/lib/realm-backend/src/realm_args_backing.cc
@@ -7,15 +7,6 @@
 
 namespace FlexFlow {
 
-// void RealmArgsBacking::add_per_device_op_state(
-//     layer_guid_t const &op_guid, Future<DeviceSpecificDeviceStates> &&future)
-// {
-//   if (per_device_op_states.find(op_guid) != per_device_op_states.end()) {
-//     throw mk_runtime_error("Op state already exists");
-//   }
-//   per_device_op_states.insert({op_guid, std::move(future)});
-// }
-
 RealmArgsBacking make_args_backing_with_empty_device_states(
     RuntimeArgConfig const &runtime_arg_config) {
   return RealmArgsBacking{runtime_arg_config, {}};
diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc
index e628f40ad8..7894a90672 100644
--- a/lib/realm-backend/src/task_wrapper.cc
+++ b/lib/realm-backend/src/task_wrapper.cc
@@ -1,10 +1,16 @@
 #include "realm-backend/task_wrapper.h"
 #include
+#include
 
 namespace FlexFlow {
 
 using namespace Realm;
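+// Descriptive note: the set added below makes register_wrapper_tasks
+// idempotent per (processor id, task id) pair. Each scenario in
+// test_update.cc constructs its own RealmTrainingBacking, and without this
+// guard the same wrapper task id would be registered with Realm repeatedly.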
+// After the device-specific states come back from an init task, store a copy
+// here to keep them from being destroyed prematurely.
+std::vector<DeviceSpecificDeviceStates> device_state_storage;
+
+std::unordered_set<std::pair<int, task_id_t>> registered_tasks;
+
 void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
                        size_t userlen, Processor p) {
   RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> const &task_args =
@@ -12,6 +18,7 @@ void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
   auto fn = task_args.impl_function.get().function_ptr;
 
   DeviceSpecificDeviceStates result = fn(task_args.accessor);
+  device_state_storage.push_back(result);
   task_args.promise.set_value(std::make_optional(result));
 }
 
@@ -55,8 +62,13 @@ void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) {
       .external_wait();
 }
 
-void register_wrapper_tasks(Processor p, task_id_t task_id,
+void register_wrapper_tasks(int p_id, Processor p, task_id_t task_id,
                             TaskSignatureAndImpl task_sig_impl) {
+  std::pair<int, task_id_t> key = {p_id, task_id};
+  if (registered_tasks.find(key) != registered_tasks.end()) {
+    return;
+  }
+  registered_tasks.insert(key);
   switch (task_sig_impl.task_signature.type) {
   case OpTaskType::INIT:
     register_wrapper_tasks_init(p, task_id);

From 6e9c9af605fa73e82f190aca8de6c17a163a082f Mon Sep 17 00:00:00 2001
From: fruitea
Date: Wed, 19 Mar 2025 10:39:17 -0700
Subject: [PATCH 64/91] tests: pass test_update

---
 .../include/realm-backend/task_result.h       | 39 +++++++++++++++++++
 .../src/realm_training_backing.cc             | 10 ++---
 lib/realm-backend/src/task_wrapper.cc         |  9 ++---
 lib/realm-backend/test/src/test_update.cc     | 16 +++++---
 4 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h
index 19cd91f104..d869982563 100644
--- a/lib/realm-backend/include/realm-backend/task_result.h
+++ b/lib/realm-backend/include/realm-backend/task_result.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H
 
 #include "realm-backend/driver.h"
+#include "realm-backend/realm_task_argument_accessor.h"
 #include
 #include
 
@@ -92,6 +93,24 @@ template <> class Future<void> {
   SharedState<void> state_;
 };
 
+template <> class Future<DeviceSpecificDeviceStates> {
+public:
+  explicit Future(
+      std::shared_ptr<std::optional<DeviceSpecificDeviceStates>> value)
+      : value_(value) {}
+  Future() = delete;
+  void set_event(Realm::Event e) { event_ = e; }
+  std::optional<DeviceSpecificDeviceStates> get() {
+    wait();
+    return *value_;
+  }
+  void wait() { event_.wait(); }
+
+private:
+  Realm::Event event_;
+  std::shared_ptr<std::optional<DeviceSpecificDeviceStates>> value_;
+};
+
 /**
  * @brief Promise class template that allows setting a result in a SharedState
  * object. It is used to fulfill a Future with a value, and provides methods to
@@ -118,6 +137,26 @@ template <> class Promise<void> {
   SharedState<void> state_;
 };
 
+// Specialization of Promise for the `DeviceSpecificDeviceStates` type. It has
+// an inner shared_ptr value, so we need a way to keep that value from being
+// destroyed early. A `shared_ptr` works here because the DeviceState stays on
+// the same node as the device that launched the init task. The value is
+// wrapped in a std::optional because the concrete size of
+// DeviceSpecificDeviceStates is not known up front.
+template <> class Promise { +public: + Promise() + : value_(std::make_shared>()) {} + void set_value(DeviceSpecificDeviceStates value) const { + *value_ = std::make_optional(value); + } + Future get_future() { + return Future(value_); + } + +private: + std::shared_ptr> value_; +}; + } // namespace FlexFlow #endif \ No newline at end of file diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index ee46105b31..3b7eb48823 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -93,7 +93,7 @@ TaskRegistry construct_task_registry_and_register_tasks_for_realm( for (task_id_t task_id : task_ids) { TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); // TODO: multi gpu - register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); + register_wrapper_tasks(0, worker_procs[0], task_id, task_signature_impl); } } @@ -135,16 +135,16 @@ initialize_args_backing(RealmTrainingBacking *backing, TaskImplFunction impl_function = task_registry.task_mapping.at(task_id).impl_function; // TODO: multi gpu launching - Promise> promise(master_mem); - Future> future = promise.get_future(); - RealmTaskArgs> args{ + Promise promise = Promise(); + Future future = promise.get_future(); + RealmTaskArgs args{ task_id, impl_function, accessor, std::move(promise)}; Event e = worker_procs[0].spawn(get_realm_task_id(task_id), &args, sizeof(args), worker_events[0]); worker_events[0] = e; future.set_event(e); - per_device_op_states.insert({node, std::move(future.get().value())}); + per_device_op_states.insert({node, future.get().value()}); } } diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index 7894a90672..f07f11b60d 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -6,20 +6,17 @@ namespace FlexFlow { using namespace Realm; -// After get device specific states from init task, storage a copy here to avoid auto destruction. 
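// A minimal, Realm-free sketch of the lifetime pattern the patches above
// converge on: the future and the promise share ownership of the result, so
// the value outlives the producing task without a global storage vector. The
// Mini* names below are illustrative assumptions, not APIs from this tree.
#include <condition_variable>
#include <memory>
#include <mutex>
#include <optional>

template <typename T> struct MiniSharedState {
  std::mutex mutex;
  std::condition_variable cv;
  std::optional<T> value; // set exactly once by the producer
};

template <typename T> class MiniFuture {
public:
  explicit MiniFuture(std::shared_ptr<MiniSharedState<T>> s) : s_(std::move(s)) {}
  T get() {
    std::unique_lock<std::mutex> lock(s_->mutex);
    s_->cv.wait(lock, [&] { return s_->value.has_value(); });
    return *s_->value;
  }

private:
  // The shared_ptr keeps the state alive even after the producer finishes,
  // which is what the removed global `device_state_storage` was papering over.
  std::shared_ptr<MiniSharedState<T>> s_;
};

template <typename T> class MiniPromise {
public:
  MiniPromise() : s_(std::make_shared<MiniSharedState<T>>()) {}
  MiniFuture<T> get_future() { return MiniFuture<T>(s_); }
  void set_value(T v) {
    {
      std::lock_guard<std::mutex> lock(s_->mutex);
      s_->value = std::move(v);
    }
    s_->cv.notify_all();
  }

private:
  std::shared_ptr<MiniSharedState<T>> s_;
};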
-std::vector<DeviceSpecificDeviceStates> device_state_storage;
-
 std::unordered_set<std::pair<int, task_id_t>> registered_tasks;
 
 void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
                        size_t userlen, Processor p) {
-  RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> const &task_args =
-      *reinterpret_cast<RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> const *>(args);
+  RealmTaskArgs<DeviceSpecificDeviceStates> const &task_args =
+      *reinterpret_cast<RealmTaskArgs<DeviceSpecificDeviceStates> const *>(args);
   auto fn = task_args.impl_function.get().function_ptr;
 
   DeviceSpecificDeviceStates result = fn(task_args.accessor);
-  device_state_storage.push_back(result);
-  task_args.promise.set_value(std::make_optional(result));
+  task_args.promise.set_value(result);
 }
 
 void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata,
diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc
index 1023399c8a..77462e2588 100644
--- a/lib/realm-backend/test/src/test_update.cc
+++ b/lib/realm-backend/test/src/test_update.cc
@@ -21,6 +21,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
   std::vector<Allocator> allocators;
   Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
                                    .only_kind(Processor::TOC_PROC);
+  assert(pq.count() > 0);
   for (Processor p : pq) {
     worker_procs.push_back(p);
     allocators.push_back(create_realm_memory_allocator(p));
@@ -73,7 +74,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
   int test_id = 0;
 
   {
-    printf("Running test %d: SGDOptimizerAttrs, momentum=0\n", ++test_id);
+    printf("Running test %d: SGDOptimizerAttrs, momentum=0...", ++test_id);
     OptimizerAttrs optimizer_attrs =
         OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
                                          /*momentum=*/0.0f,
@@ -83,11 +84,12 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
         p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
         optimizer_tensor_source, computation_graph, runtime_arg_config,
         optimizer_attrs);
-    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs);
+    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait();
+    printf("passed\n");
   }
 
   {
-    printf("Running test %d: SGDOptimizerAttrs, momentum=0.9\n", ++test_id);
+    printf("Running test %d: SGDOptimizerAttrs, momentum=0.9...", ++test_id);
     OptimizerAttrs optimizer_attrs =
         OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
                                          /*momentum=*/0.9,
@@ -97,11 +99,12 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
         p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
         optimizer_tensor_source, computation_graph, runtime_arg_config,
         optimizer_attrs);
-    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs);
+    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait();
+    printf("passed\n");
  }
 
   {
-    printf("Running test %d: AdamOptimizerAttrs\n", ++test_id);
+    printf("Running test %d: AdamOptimizerAttrs...", ++test_id);
     OptimizerAttrs optimizer_attrs =
         OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001,
                                           /*beta1=*/0.9,
@@ -115,6 +118,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
         p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
         optimizer_tensor_source, computation_graph, runtime_arg_config,
         optimizer_attrs);
-    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs);
+    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait();
+    printf("passed\n");
   }
 }
From 7b1f653198c63cba94d987213a72189ab0e882bf Mon Sep 17 00:00:00 2001
From: fruitea
Date: Wed, 19 Mar 2025 10:41:08 -0700
Subject: [PATCH
65/91] chore: minor --- lib/realm-backend/test/src/test_update.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc index 77462e2588..0b332d1ccc 100644 --- a/lib/realm-backend/test/src/test_update.cc +++ b/lib/realm-backend/test/src/test_update.cc @@ -74,7 +74,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, int test_id = 0; { - printf("Running test %d: SGDOptimizerAttrs, momentum=0...", ++test_id); + printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0...\n", ++test_id); OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.0f, @@ -89,7 +89,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, } { - printf("Running test %d: SGDOptimizerAttrs, momentum=0.9...", ++test_id); + printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0.9...\n", ++test_id); OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, @@ -104,7 +104,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, } { - printf("Running test %d: AdamOptimizerAttrs...", ++test_id); + printf("\nRunning test %d: AdamOptimizerAttrs...\n", ++test_id); OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, /*beta1=*/0.9, From 64a82b3478cc4c1da841f84335d0f426f9eb0a2d Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 11:15:55 -0700 Subject: [PATCH 66/91] Add e2e test --- .../local-execution/model_training_instance.h | 1 + .../src/model_training_instance.cc | 10 ++ lib/local-execution/test/src/test_e2e.cc | 140 ++++++++++++++++++ .../test/src/test_local_cost_estimator.cc | 4 +- .../test/src/test_loss_functions.cc | 2 +- .../test/src/test_task_registry.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- 7 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 lib/local-execution/test/src/test_e2e.cc diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index b36b20ed04..54b76313ab 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,6 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); + GenericTensorAccessorW get_loss_tensor_backing(); }; } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d404221d88..f232011230 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,4 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } +GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing + .tensor_gradient_mapping.at(this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + return loss_tensor_backing; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc new file mode 100644 index 0000000000..3f3a7ed0bc --- /dev/null +++ 
b/lib/local-execution/test/src/test_e2e.cc @@ -0,0 +1,140 @@ +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" +#include "utils/containers/get_only.h" +#include "local-execution/model_training_instance.h" +#include + +using namespace ::FlexFlow; + +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { + float* first_epoch_ptr = first_epoch.get_float_ptr(); + float* last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + for (int i = 0; i < batch_size; i++) { + if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + return false; + } + } + + return true; +} + + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("E2ETest") { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + + Allocator allocator = create_local_cuda_memory_allocator(); + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{label_tensor}, + label_tensor_backing}}, + {}, + {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + LocalTrainingBacking local_training_backing = + 
LocalTrainingBacking{allocator, + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 10; + std::vector loss_values (num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + loss_values[i] = model_training_instance.get_loss_tensor_backing(); + } + + // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + } +} diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 30682c9a48..0fa841be20 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,8 +9,8 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Cost Estimator") { +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalCostEstimator") { // local backing initialization ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index 2bf138e204..ae76dcccf9 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -14,7 +14,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Loss Functions") { + TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index dd4b6f5b44..16877b0e09 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -9,7 +9,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Task Registry") { + TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; nonnegative_int embed_dim = 32_n; diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 1f8684f38a..dcd9c025b3 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Execute Update") { + TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; From ffd96e2c4a341da4268384c79ffaef1eca889f70 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 11:17:09 -0700 Subject: [PATCH 67/91] Format --- .../src/model_training_instance.cc | 4 +- lib/local-execution/test/src/test_e2e.cc | 61 ++++++++++--------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index f232011230..96a324b492 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -56,8 +56,8 @@ void ModelTrainingInstance::update() { GenericTensorAccessorW 
ModelTrainingInstance::get_loss_tensor_backing() { gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing - .tensor_gradient_mapping.at(this->logit_tensor); + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 3f3a7ed0bc..33ffbe5f96 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -3,22 +3,24 @@ #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" +#include "local-execution/model_training_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" #include "utils/containers/get_only.h" -#include "local-execution/model_training_instance.h" #include using namespace ::FlexFlow; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { - float* first_epoch_ptr = first_epoch.get_float_ptr(); - float* last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, + GenericTensorAccessorW const &last_epoch) { + float *first_epoch_ptr = first_epoch.get_float_ptr(); + float *last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = + first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); for (int i = 0; i < batch_size; i++) { if (first_epoch_ptr[i] < last_epoch_ptr[i]) { return false; @@ -28,7 +30,6 @@ bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorA return true; } - TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("E2ETest") { // initialize runtime @@ -39,8 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { // allocate label tensors LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = - loss_tensor_source.new_loss_tensor(); + loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); nonnegative_int batch_size = 10_n; nonnegative_int data_dim = 16_n; @@ -53,11 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{label_tensor}, - label_tensor_backing}}, - {}, - {}}; + {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); @@ -99,33 +95,37 @@ TEST_SUITE(FF_TEST_SUITE) { ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, - 
allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - - // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs - }; + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = + ModelTrainingInstance{allocator, + local_training_backing, + logit_tensor, + label_tensor, + loss_attrs, + optimizer_attrs}; int num_epochs = 10; - std::vector loss_values (num_epochs); + std::vector loss_values(num_epochs); for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); @@ -133,8 +133,9 @@ TEST_SUITE(FF_TEST_SUITE) { model_training_instance.update(); loss_values[i] = model_training_instance.get_loss_tensor_backing(); } - - // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + + // Assert that each sample in the batch has a lower loss in last epoch than + // the first epoch CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); } } From 2f75451059455612aa716eb53e38c888396ca85a Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 15:32:05 -0700 Subject: [PATCH 68/91] Pass cost estimator test --- .../include/local-execution/task_argument_accessor.h | 8 +++++++- lib/local-execution/src/local_cost_estimator.cc | 12 ++++++------ lib/local-execution/src/local_training_backing.cc | 7 ++++--- .../test/src/test_local_cost_estimator.cc | 2 +- .../src/per_device_op_state.cc} | 0 5 files changed, 18 insertions(+), 11 deletions(-) rename lib/{local-execution/src/per_device_state.cc => task-spec/src/per_device_op_state.cc} (100%) diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 99c1c1296b..285b41991a 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -14,7 +14,13 @@ struct TaskArgumentAccessor { if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { PerDeviceOpState device_states = this->ptr->get_concrete_arg(slot).get(); - return device_states.get(); + if (device_states.has()) { + return device_states.get(); + } else { + throw mk_runtime_error( + fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", + device_states.index())); + } } else { return this->ptr->get_concrete_arg(slot).get(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 532fcc91c2..0ee6c9a987 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,12 +90,12 @@ CostDetails LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = - get_layer_by_name(computation_graph, "operator"); - float fwd = - execute_forward(local_backing, operator_layer_guid, allocator).value(); - float bwd = - execute_backward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); + + float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + std::cout << 
"completed forward" << std::endl; + float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index b2e0a2fb7e..7d916715f5 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,8 +104,7 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; + auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -116,13 +115,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 0fa841be20..e493265f86 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*kdim=*/embed_dim, /*vdim=*/embed_dim, /*dropout=*/0.0, - /*bias=*/true, + /*bias=*/false, /*add_bias_kv=*/false, /*add_zero_attn=*/false, }; diff --git a/lib/local-execution/src/per_device_state.cc b/lib/task-spec/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/task-spec/src/per_device_op_state.cc From 2746e141ae3dd9f4fac8f03ddfaea9fb781b5b44 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Mon, 5 May 2025 05:33:42 -0700 Subject: [PATCH 69/91] Add nccl fix and host accessor access --- lib/kernels/include/kernels/accessor.h | 1 + .../kernels/managed_per_device_ff_handle.h | 6 +- lib/kernels/src/accessor.cc | 10 +++ .../src/managed_per_device_ff_handle.cc | 22 +++++- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 2 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 2 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- .../local-execution/model_training_instance.h | 2 +- .../local-execution/task_argument_accessor.h | 6 +- .../src/local-execution/ops/linear.cc | 6 +- .../src/local_cost_estimator.cc | 11 ++- .../src/local_training_backing.cc | 9 ++- 
.../src/model_training_instance.cc | 4 +- lib/local-execution/src/optimizer.cc | 2 +- lib/local-execution/test/src/test_e2e.cc | 75 +++++++++++++------ .../test/src/test_local_cost_estimator.cc | 2 +- .../test/src/test_loss_functions.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- lib/pcg/include/pcg/computation_graph.h | 2 + lib/pcg/src/pcg/computation_graph.cc | 14 ++++ 35 files changed, 146 insertions(+), 66 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..55b120b090 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -75,6 +75,7 @@ std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); float *get_float_ptr(GenericTensorAccessorW const &); +void write_to_host_float_ptr(GenericTensorAccessorW const &, float *); double *get_double_ptr(GenericTensorAccessorW const &); half *get_half_ptr(GenericTensorAccessorW const &); std::vector diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..05e8406de8 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle(int num_ranks, int my_rank); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -25,6 +25,10 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle *handle; }; +ManagedPerDeviceFFHandle initialize_single_gpu_handle(); +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank); + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..7f4f61c271 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,4 +1,5 @@ #include "kernels/accessor.h" +#include "device.h" namespace FlexFlow { @@ -76,6 +77,15 @@ float *get_float_ptr(GenericTensorAccessorW const &a) { return get(a); } +void write_to_host_float_ptr(GenericTensorAccessorW const &a, float *host_ptr) { + float *device_ptr = get(a); + int total_elements = get_volume(a.shape).unwrap_nonnegative(); + checkCUDA(cudaMemcpy(host_ptr, + device_ptr, + total_elements * sizeof(float), + cudaMemcpyDeviceToHost)); +} + double *get_double_ptr(GenericTensorAccessorW const &a) { return get(a); } diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..e327a7b1e1 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,9 +1,10 @@ #include "kernels/managed_per_device_ff_handle.h" #include "device.h" +#include "kernels/nccl.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle(int num_ranks, int my_rank) { handle = new PerDeviceFFHandle; handle->workSpaceSize = 1024 * 1024; handle->allowTensorOpMathConversion = true; @@ -11,6 +12,13 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { checkCUDNN(cudnnCreate(&handle->dnn)); checkCUBLAS(cublasCreate(&handle->blas)); checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); + +#ifdef FF_USE_NCCL + ncclUniqueId 
ncclId; + checkNCCL(ncclGetUniqueId(&ncclId)); + checkNCCL(ncclCommInitRank( + &handle->ncclComm, num_ranks, ncclId, my_rank)); // todo generalize +#endif } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -28,6 +36,9 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUDNN(cudnnDestroy(handle->dnn)); checkCUBLAS(cublasDestroy(handle->blas)); checkCUDA(cudaFree(handle->workSpace)); +#ifdef FF_USE_NCCL + checkNCCL(ncclCommDestroy(handle->ncclComm)); +#endif delete handle; } } @@ -36,4 +47,13 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } +ManagedPerDeviceFFHandle initialize_single_gpu_handle() { + return ManagedPerDeviceFFHandle(1, 0); +} + +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank) { + return ManagedPerDeviceFFHandle(num_ranks, my_rank); +} + } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..a15497984c 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..b9cfbf3ec5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..94ce268b93 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..68f35cb099 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..ca6b95dadc 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int size_per_input = 100_n; ff_dim_t concat_axis = ff_dim_t{0_n}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); 
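// A minimal standalone sketch of the single-rank NCCL bootstrap that the
// initialize_single_gpu_handle() calls being threaded through these tests
// wrap. It assumes CUDA device 0 and the stock NCCL API; the helper name
// make_single_rank_comm is an illustrative assumption, and error checking
// is elided.
#include <cuda_runtime.h>
#include <nccl.h>

ncclComm_t make_single_rank_comm() {
  ncclUniqueId id;
  ncclGetUniqueId(&id); // with one rank there is no other process to broadcast the id to
  ncclComm_t comm;
  cudaSetDevice(0);
  ncclCommInitRank(&comm, /*nranks=*/1, id, /*rank=*/0);
  return comm; // caller releases it later with ncclCommDestroy(comm)
}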
ManagedFFStream managed_stream{}; TensorShape input_shape = diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..7e78544df8 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..c9e1778843 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..ffe8e0dfd2 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..9e89c86433 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape feature_shape = make_float_tensor_shape_from_legion_dims({feature_size}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..281a146a30 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..874e2b8d98 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -22,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc 
b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..7f993c12d3 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims( {10_n, 10_n, 10_n, 10_n, 10_n}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..8c47c2a49a 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..1e969f6d82 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..ba808c491a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..cba293aed1 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..65d1ed7783 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc 
b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..f7007d76e4 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 54b76313ab..2deed6b0a2 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - GenericTensorAccessorW get_loss_tensor_backing(); + void write_loss_tensor_to_host(float *host_ptr); }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 285b41991a..499b5ff7d6 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -17,9 +17,9 @@ struct TaskArgumentAccessor { if (device_states.has()) { return device_states.get(); } else { - throw mk_runtime_error( - fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); + throw mk_runtime_error(fmt::format( + "Invalid access to PerDeviceOpState attempted, instead it holds: ", + device_states.index())); } } else { return this->ptr->get_concrete_arg(slot).get(); diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc index 94f92d37ee..768293b32f 100644 --- a/lib/local-execution/src/local-execution/ops/linear.cc +++ b/lib/local-execution/src/local-execution/ops/linear.cc @@ -89,7 +89,6 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -102,6 +101,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { float const *bias_ptr = NULL; if (attrs.use_bias) { + auto bias = acc.get_tensor(BIAS); bias_ptr = bias.get_float_ptr(); } @@ -118,14 +118,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size.unwrap_nonnegative()); } -; - static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -137,6 +134,7 @@ static std::optional float const *bias_ptr = NULL; if (attrs.use_bias) { + auto bias = acc.get_tensor(BIAS); bias_ptr = bias.get_float_ptr(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0ee6c9a987..0a84c19066 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,11 +90,14 @@ CostDetails 
LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - - float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = + get_layer_by_name(computation_graph, "operator"); + + float fwd = + execute_forward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed forward" << std::endl; - float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + float bwd = + execute_backward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 7d916715f5..d508c34210 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,7 +104,8 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = task_sig_impl.impl_function.get().function_ptr; + auto fn = + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -115,15 +116,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { - + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 96a324b492..e58b5dfe7d 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,14 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } -GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { +void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - return loss_tensor_backing; + write_to_host_float_ptr(loss_tensor_backing, host_ptr); } } // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1b9ce83d14..1b8fc37b2d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -70,7 +70,7 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { int size = weight_grad.shape.get_volume().unwrap_nonnegative(); assert(weight_grad.shape.get_volume().unwrap_nonnegative() & - weight.shape.get_volume().unwrap_nonnegative() == 0); + weight.shape.get_volume().unwrap_nonnegative()); int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 
33ffbe5f96..5791a94cbb 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -14,19 +14,12 @@ using namespace ::FlexFlow; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, - GenericTensorAccessorW const &last_epoch) { - float *first_epoch_ptr = first_epoch.get_float_ptr(); - float *last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = - first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { for (int i = 0; i < batch_size; i++) { - if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + if (first_epoch[i] < last_epoch[i]) { return false; } } - return true; } @@ -34,7 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("E2ETest") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); @@ -44,7 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int batch_size = 10_n; nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + nonnegative_int hidden_dim = 32_n; + nonnegative_int output_dim = 1_n; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, @@ -62,32 +56,55 @@ TEST_SUITE(FF_TEST_SUITE) { TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{data_dim, hidden_dim}}, + DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = - add_input_layer(computation_graph, input_tensor_shape); + add_input_layer_with_grad(computation_graph, input_tensor_shape); - LayerAddedResult weights_layer = add_layer( + LayerAddedResult weights_layer_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, std::nullopt}, {}, {}); - LayerAddedResult linear_operator = add_layer( + LayerAddedResult weights_layer_2 = add_layer( computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + weights_layer_1.outputs); + + LayerAddedResult linear_operator_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + linear_operator_1.outputs, + weights_layer_2.outputs); + + tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), @@ -124,18 +141,28 @@ TEST_SUITE(FF_TEST_SUITE) { loss_attrs, optimizer_attrs}; - int num_epochs = 10; - std::vector loss_values(num_epochs); + int num_epochs = 5; + int 
num_samples = batch_size.unwrap_nonnegative(); + std::vector loss_values(num_epochs); for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - loss_values[i] = model_training_instance.get_loss_tensor_backing(); + float *host_loss_ptr = new float[num_samples]; + model_training_instance.write_loss_tensor_to_host(host_loss_ptr); + loss_values[i] = host_loss_ptr; } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + float *first_epoch = loss_values[0]; + float *last_epoch = loss_values[num_epochs - 1]; + CHECK(did_loss_decrease( + first_epoch, last_epoch, batch_size.unwrap_nonnegative())); + + for (int i = 0; i < num_epochs; i++) { + delete[] loss_values[i]; + } } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index e493265f86..c9c5afe04e 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { // local backing initialization - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index ae76dcccf9..ca2482653b 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index dcd9c025b3..75ba517d1b 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index efc955ec92..60e825c11a 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -24,6 +24,8 @@ LayerAddedResult add_layer( LayerAddedResult add_input_layer(ComputationGraph &computation_graph, TensorShape const &tensor_shape); +LayerAddedResult add_input_layer_with_grad(ComputationGraph &computation_graph, + TensorShape const &tensor_shape); TensorAttrs get_tensor_attrs(ComputationGraph const &, tensor_guid_t const &); bool are_tensor_guid_shapes_equivalent(ComputationGraph const &cg, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 200410dd7b..b8917eed35 100644 --- 
a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -100,6 +100,20 @@ LayerAddedResult add_input_layer(ComputationGraph &cg, /*outputs=*/std::vector{CreateGrad::NO}); } +LayerAddedResult add_input_layer_with_grad(ComputationGraph &cg, + TensorShape const &tensor_shape) { + LayerAttrs layer_attrs = LayerAttrs{ + /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}}, + /*name=*/std::nullopt, + }; + + return add_layer(cg, + layer_attrs, + /*inputs=*/{}, + /*weights=*/{}, + /*outputs=*/std::vector{CreateGrad::YES}); +} + TensorAttrs get_tensor_attrs(ComputationGraph const &cg, tensor_guid_t const &t) { return cg.raw_graph.at(t.raw_graph_output); From 31df7223cae1ef0d59ac2a0ba07444d0795d0c2f Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Thu, 8 May 2025 02:49:42 -0700 Subject: [PATCH 70/91] Move operators into task-spec --- .proj.toml | 9 +- .../include/kernels/optimizer_kernels.h | 3 - lib/kernels/src/accessor.cc | 202 ------------------ lib/kernels/src/allocation.cc | 21 -- lib/kernels/src/kernels/accessor.cc | 17 ++ .../src/managed_per_device_ff_handle.cc | 20 +- .../allocated_tensors.struct.toml | 1 - .../local_task_argument_accessor.h | 2 +- .../local_tensor_backing.struct.toml | 3 - .../include/local-execution/loss_functions.h | 2 +- .../local-execution/loss_tensor_source.h | 3 +- .../include/local-execution/optimizer.h | 2 +- .../local-execution/task_registry.struct.toml | 2 +- lib/local-execution/src/allocated_tensors.cc | 6 +- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/loss_tensor_source.cc | 2 +- lib/local-execution/src/task_registry.cc | 2 +- lib/local-execution/test/CMakeLists.txt | 7 +- .../test/modify_test_commands.cmake | 21 -- lib/local-execution/test/src/test_e2e.cc | 11 +- .../test/src/test_local_task_arg_accessor.cc | 2 +- .../test/src/test_task_registry.cc | 2 +- lib/pcg/include/pcg/metric_attrs.h | 2 +- lib/pcg/src/pcg/metric_attrs.cc | 4 +- .../fwd_bwd_op_task_impl_function.h | 6 +- .../task-spec}/generic_task_impl_function.h | 6 +- .../task-spec}/init_op_task_impl_function.h | 6 +- .../task-spec}/itask_argument_accessor.h | 6 +- .../task-spec/loss_tensor_t.struct.toml | 5 +- .../include/task-spec}/ops/attention.h | 6 +- .../include/task-spec}/ops/batch_matmul.h | 6 +- .../include/task-spec}/ops/batch_norm.h | 2 +- .../include/task-spec}/ops/cast.h | 2 +- .../include/task-spec}/ops/combine.h | 2 +- .../include/task-spec}/ops/concat.h | 2 +- .../include/task-spec}/ops/conv_2d.h | 2 +- .../include/task-spec}/ops/dropout.h | 2 +- .../include/task-spec}/ops/element_binary.h | 8 +- .../include/task-spec}/ops/element_unary.h | 2 +- .../include/task-spec}/ops/embedding.h | 2 +- .../include/task-spec}/ops/flat.h | 2 +- .../include/task-spec}/ops/gather.h | 2 +- .../include/task-spec}/ops/input.h | 0 .../include/task-spec}/ops/layer_norm.h | 2 +- .../include/task-spec}/ops/linear.h | 2 +- .../include/task-spec}/ops/noop.h | 0 .../include/task-spec}/ops/parallel_op.h | 0 .../include/task-spec}/ops/pool_2d.h | 2 +- .../include/task-spec}/ops/reduce.h | 2 +- .../include/task-spec}/ops/reduction.h | 2 +- .../include/task-spec}/ops/repartition.h | 2 +- .../include/task-spec}/ops/replicate.h | 2 +- .../include/task-spec}/ops/reshape.h | 2 +- .../include/task-spec}/ops/reverse.h | 2 +- .../include/task-spec}/ops/softmax.h | 6 +- .../include/task-spec}/ops/split.h | 2 +- .../include/task-spec}/ops/topk.h | 2 +- .../include/task-spec}/ops/transpose.h | 2 +- .../include/task-spec}/ops/weight.h | 0 
.../include/task-spec}/permissions.h | 4 +- .../task-spec}/privilege_tensor_accessor.h | 6 +- .../task-spec}/task_argument_accessor.h | 6 +- .../task_impl_function.variant.toml | 6 +- .../include/task-spec}/task_signature_impl.h | 6 +- .../task_signature_impl.struct.toml | 2 +- .../src/{ => task-spec}/concrete_arg.cc | 0 .../fwd_bwd_op_task_impl_function.cc | 2 +- .../task-spec}/generic_task_impl_function.cc | 2 +- .../task-spec}/init_op_task_impl_function.cc | 2 +- .../src/task-spec/itask_argument_accessor.cc | 1 + .../src/{ => task-spec}/op_arg_ref.cc | 0 .../src/{ => task-spec}/op_arg_spec.cc | 0 .../src/{ => task-spec}/op_task_invocation.cc | 0 .../src/{ => task-spec}/op_task_signature.cc | 0 .../op_task_to_task_invocation.cc | 0 .../src/{ => task-spec}/op_tensor_spec.cc | 0 .../src/task-spec}/ops/attention.cc | 2 +- .../src/task-spec}/ops/batch_matmul.cc | 2 +- .../src/task-spec}/ops/batch_norm.cc | 2 +- .../src/task-spec}/ops/cast.cc | 2 +- .../src/task-spec}/ops/combine.cc | 2 +- .../src/task-spec}/ops/concat.cc | 2 +- .../src/task-spec}/ops/conv_2d.cc | 2 +- .../src/task-spec}/ops/dropout.cc | 2 +- .../src/task-spec}/ops/element_binary.cc | 4 +- .../src/task-spec}/ops/element_unary.cc | 2 +- .../src/task-spec}/ops/flat.cc | 2 +- .../src/task-spec}/ops/gather.cc | 2 +- .../src/task-spec}/ops/input.cc | 2 +- .../src/task-spec}/ops/layer_norm.cc | 2 +- .../src/task-spec}/ops/linear.cc | 4 +- .../src/task-spec}/ops/noop.cc | 2 +- .../src/task-spec}/ops/pool_2d.cc | 3 +- .../src/task-spec}/ops/reduce.cc | 3 +- .../src/task-spec}/ops/reduction.cc | 2 +- .../src/task-spec}/ops/repartition.cc | 2 +- .../src/task-spec}/ops/replicate.cc | 2 +- .../src/task-spec}/ops/reshape.cc | 2 +- .../src/task-spec}/ops/reverse.cc | 2 +- .../src/task-spec}/ops/softmax.cc | 2 +- .../src/task-spec}/ops/split.cc | 2 +- .../src/task-spec}/ops/topk.cc | 2 +- .../src/task-spec}/ops/transpose.cc | 2 +- .../src/task-spec}/ops/weight.cc | 2 +- .../{ => task-spec}/per_device_op_state.cc | 0 .../src/task-spec}/permissions.cc | 2 +- .../task-spec/privilege_tensor_accessor.cc | 1 + .../src/{ => task-spec}/runtime_arg_ref.cc | 0 .../src/{ => task-spec}/task_arg_spec.cc | 0 .../src/task-spec/task_argument_accessor.cc | 1 + .../src/{ => task-spec}/task_invocation.cc | 0 .../src/{ => task-spec}/task_signature.cc | 0 .../src/task-spec}/task_signature_impl.cc | 60 +++--- .../{ => task-spec}/variadic_tensor_ref.cc | 0 lib/task-spec/test/CMakeLists.txt | 14 ++ lib/task-spec/test/src/task-spec/arg_ref.cc | 33 +++ 116 files changed, 247 insertions(+), 410 deletions(-) delete mode 100644 lib/kernels/src/accessor.cc delete mode 100644 lib/kernels/src/allocation.cc delete mode 100644 lib/local-execution/test/modify_test_commands.cmake rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/fwd_bwd_op_task_impl_function.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/generic_task_impl_function.h (80%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/init_op_task_impl_function.h (81%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/itask_argument_accessor.h (82%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/attention.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/batch_matmul.h (75%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/batch_norm.h (93%) rename 
lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/cast.h (95%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/combine.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/concat.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/conv_2d.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/dropout.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/element_binary.h (73%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/element_unary.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/embedding.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/flat.h (90%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/gather.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/input.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/layer_norm.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/linear.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/noop.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/parallel_op.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/pool_2d.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reduce.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reduction.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/repartition.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/replicate.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reshape.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reverse.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/softmax.h (78%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/split.h (90%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/topk.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/transpose.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/weight.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/permissions.h (90%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/privilege_tensor_accessor.h (81%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_argument_accessor.h (96%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_impl_function.variant.toml (72%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature_impl.h (71%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature_impl.struct.toml (86%) rename lib/task-spec/src/{ => 
task-spec}/concrete_arg.cc (100%) rename lib/{local-execution/src => task-spec/src/task-spec}/fwd_bwd_op_task_impl_function.cc (96%) rename lib/{local-execution/src => task-spec/src/task-spec}/generic_task_impl_function.cc (96%) rename lib/{local-execution/src => task-spec/src/task-spec}/init_op_task_impl_function.cc (96%) create mode 100644 lib/task-spec/src/task-spec/itask_argument_accessor.cc rename lib/task-spec/src/{ => task-spec}/op_arg_ref.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_arg_spec.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_task_invocation.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_task_signature.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_task_to_task_invocation.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_tensor_spec.cc (100%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/attention.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/batch_matmul.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/batch_norm.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/cast.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/combine.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/concat.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/conv_2d.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/dropout.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/element_binary.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/element_unary.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/flat.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/gather.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/input.cc (76%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/layer_norm.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/linear.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/noop.cc (95%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/pool_2d.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reduce.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reduction.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/repartition.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/replicate.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reshape.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reverse.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/softmax.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/split.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/topk.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/transpose.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/weight.cc (76%) rename 
lib/task-spec/src/{ => task-spec}/per_device_op_state.cc (100%) rename lib/{local-execution/src => task-spec/src/task-spec}/permissions.cc (97%) create mode 100644 lib/task-spec/src/task-spec/privilege_tensor_accessor.cc rename lib/task-spec/src/{ => task-spec}/runtime_arg_ref.cc (100%) rename lib/task-spec/src/{ => task-spec}/task_arg_spec.cc (100%) create mode 100644 lib/task-spec/src/task-spec/task_argument_accessor.cc rename lib/task-spec/src/{ => task-spec}/task_invocation.cc (100%) rename lib/task-spec/src/{ => task-spec}/task_signature.cc (100%) rename lib/{local-execution/src => task-spec/src/task-spec}/task_signature_impl.cc (93%) rename lib/task-spec/src/{ => task-spec}/variadic_tensor_ref.cc (100%) create mode 100644 lib/task-spec/test/CMakeLists.txt create mode 100644 lib/task-spec/test/src/task-spec/arg_ref.cc diff --git a/.proj.toml b/.proj.toml index 8eed6166cd..3a120ca553 100644 --- a/.proj.toml +++ b/.proj.toml @@ -56,13 +56,20 @@ has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false -[targets.local-execution] +[targets.task_spec] type = "lib" has-cpu-only-tests = true has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false +[targets.local-execution] +type = "lib" +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false + [targets.models] type = "lib" has-cpu-only-tests = true diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index e5f8d243a1..39284b4a6f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -36,8 +36,6 @@ void adam_ps_update_task_gpu(ffStream_t, float beta2, float weight_decay, float epsilon, - size_t size, - int num_replicas, float const *weight_grad_ptr, size_t size, int num_replicas, @@ -54,7 +52,6 @@ void adam_nccl_update_task_gpu(ffStream_t, size_t size, PerDeviceFFHandle const &, float const *weight_grad_ptr, - size_t size, float *weight_ptr, float *adam_v_ptr, float *adam_m_ptr); diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 7f4f61c271..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,202 +0,0 @@ -#include "kernels/accessor.h" -#include "device.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - 
a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -void write_to_host_float_ptr(GenericTensorAccessorW const &a, float *host_ptr) { - float *device_ptr = get(a); - int total_elements = get_volume(a.shape).unwrap_nonnegative(); - checkCUDA(cudaMemcpy(host_ptr, - device_ptr, - total_elements * sizeof(float), - cudaMemcpyDeviceToHost)); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index 114f817215..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return 
this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index b5042f77a0..409b7533f9 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -234,6 +234,11 @@ bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } +bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, + GenericTensorAccessorW const &acc2) { + return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; +} + bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -241,9 +246,21 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, accessor.data_type == expected_dtype; } +bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, + ArrayShape const &expected_shape, + DataType const &expected_dtype) { + return accessor.shape == expected_shape && + accessor.data_type == expected_dtype; +} + std::pair<ArrayShape, DataType> get_shape_and_datatype(GenericTensorAccessorR const &accessor) { return std::make_pair(accessor.shape, accessor.data_type); } +std::pair<ArrayShape, DataType> + get_shape_and_datatype(GenericTensorAccessorW const &accessor) { + return std::make_pair(accessor.shape, accessor.data_type); +} + } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index dc1303b8e0..7c619bb557 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -48,13 +48,25 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } -ManagedPerDeviceFFHandle initialize_single_gpu_handle() { - return ManagedPerDeviceFFHandle(1, 0); +ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, bool allowTensorOpMathConversion) { + return ManagedPerDeviceFFHandle{ + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + }; } ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, - int my_rank) { - return ManagedPerDeviceFFHandle(num_ranks, my_rank); + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { + return ManagedPerDeviceFFHandle{ + /*num_ranks=*/num_ranks, + /*my_rank=*/my_rank, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + }; } } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml index 09245097b4..33985b0d74 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -3,7 +3,6 @@ name = "AllocatedTensors" features = [ "eq", "fmt", - "hash", ] includes = [
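With the signature change above, callers of initialize_single_gpu_handle now choose the workspace size and tensor-op math policy explicitly instead of relying on the old zero-argument form. A usage sketch, assuming no defaulted arguments are added; the 1 MiB workspace value is illustrative only.

    // Sketch: single-GPU handle with an explicit workspace configuration.
    ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
        /*workSpaceSize=*/1024 * 1024,
        /*allowTensorOpMathConversion=*/true);

diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h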
index c46534330b..d95545d1cc 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "task-spec/slot_tensor_type_id.dtg.h" #include #include diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml index c34063af5d..bd59ec325d 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -3,7 +3,6 @@ name = "LocalTensorBacking" features = [ "eq", "fmt", - "hash" ] includes = [ @@ -15,9 +14,7 @@ includes = [ ] src_includes = [ - "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", - "utils/hash/vector.h", "utils/fmt/vector.h", ] diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index c06908503a..d625088be4 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,7 +16,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" #include "task-spec/loss_tensor_t.dtg.h" diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h index d9858cde40..b794207c7f 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H #include "task-spec/loss_tensor_t.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -12,7 +13,7 @@ struct LossTensorSource { loss_tensor_t new_loss_tensor(); private: - static size_t next_available_loss_tensor_id; + static nonnegative_int next_available_loss_tensor_id; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index f6bd5a3ee9..7b08036059 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml index c3784b617f..f5daa62090 100644 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ 
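The loss_tensor_source.h hunk above retypes the static id counter from size_t to nonnegative_int (initialized to 0_n in the loss_tensor_source.cc hunk below). The body of new_loss_tensor is not shown in this patch; a plausible sketch of the allocation pattern, offered purely as an assumption:

    // Sketch: hand out monotonically increasing loss tensor ids.
    loss_tensor_t LossTensorSource::new_loss_tensor() {
      loss_tensor_t result =
          loss_tensor_t{LossTensorSource::next_available_loss_tensor_id};
      LossTensorSource::next_available_loss_tensor_id =
          LossTensorSource::next_available_loss_tensor_id + 1_n;
      return result;
    }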
b/lib/local-execution/include/local-execution/task_registry.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "local-execution/task_signature_impl.dtg.h", + "task-spec/task_signature_impl.dtg.h", "task-spec/task_id_t.dtg.h", "pcg/layer_guid_t.dtg.h", ] diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 196da16ace..d400b4f815 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -35,7 +35,7 @@ bool are_allocated_forward_tensors_valid( if (!is_allocated_tensor_backing_valid( TensorTypeVariant{tensor_guid}, allocated_tensors.tensor_type_backings, - ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + array_shape_from_tensor_shape(tensor_attrs.at(tensor_guid).shape))) { return false; } } else { @@ -59,7 +59,7 @@ bool are_allocated_gradient_tensors_valid( } ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_grad.first).shape); TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -101,7 +101,7 @@ bool are_allocated_optimizer_tensors_valid( } ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_optimizers.first).shape); for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index d508c34210..4b5ee0b782 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,7 +1,7 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc index da1efa6b85..f5ce639087 100644 --- a/lib/local-execution/src/loss_tensor_source.cc +++ b/lib/local-execution/src/loss_tensor_source.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -size_t LossTensorSource::next_available_loss_tensor_id = 0; +nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n; LossTensorSource::LossTensorSource() {} diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 2787342a5f..0acc3d865d 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,5 +1,5 @@ #include "local-execution/task_registry.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph.h" namespace FlexFlow { diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index a973c6967b..0e79376575 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -11,11 +11,6 @@ ff_add_test_executable( local-execution kernels op-attrs + task-spec ) -set(FF_TEST_EXEC_NAME "local-execution-tests") -add_custom_command( - TARGET ${FF_TEST_EXEC_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} 
-DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake - DEPENDS ${FF_TEST_EXEC_NAME} -) diff --git a/lib/local-execution/test/modify_test_commands.cmake b/lib/local-execution/test/modify_test_commands.cmake deleted file mode 100644 index 6494ae2d78..0000000000 --- a/lib/local-execution/test/modify_test_commands.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# modify_test_commands.cmake - -file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") - -foreach(ctest_tests_file IN LISTS ctest_tests_files) - file(READ "${ctest_tests_file}" content) - - # add nix run prefix - string(REGEX REPLACE - "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" - "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" - content "${content}") - - # add environment - # string(REGEX REPLACE - # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" - # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" - # content "${content}") - - file(WRITE "${ctest_tests_file}" "${content}") -endforeach() diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 5791a94cbb..b527430d67 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -23,8 +23,8 @@ bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { return true; } -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("E2ETest") { +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("LocalBackend e2e Training") { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); @@ -47,7 +47,12 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping=*/{}, + }; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 9966ca5c10..29b3b432cd 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/variant.h" using namespace ::FlexFlow; diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index 16877b0e09..c87fd3a899 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph_builder.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h"
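The test_e2e.cc hunk above shows did_loss_decrease only in outline, since the diff context elides its loop body. A sketch of the per-sample check implied by its signature and by the "lower loss in the last epoch" assertion earlier; the strict inequality is an assumption.

    // Sketch: every sample must improve between the first and last epoch.
    bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) {
      for (int i = 0; i < batch_size; i++) {
        if (last_epoch[i] >= first_epoch[i]) {
          return false;
        }
      }
      return true;
    }

diff --git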
a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h index 343c2154dd..21f9115a67 100644 --- a/lib/pcg/include/pcg/metric_attrs.h +++ b/lib/pcg/include/pcg/metric_attrs.h @@ -1,7 +1,7 @@ #ifndef _FF_METRICS_H_ #define _FF_METRICS_H_ -#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "op-attrs/ops/loss_functions/loss_function.dtg.h" #include "pcg/metric.dtg.h" #include "utils/fmt.h" #include diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc index 9a93e75350..5357775149 100644 --- a/lib/pcg/src/pcg/metric_attrs.cc +++ b/lib/pcg/src/pcg/metric_attrs.cc @@ -1,4 +1,5 @@ #include "pcg/metric_attrs.h" +#include namespace FlexFlow { MetricsAttrs::MetricsAttrs(LossFunction _loss_type, @@ -29,8 +30,7 @@ MetricsAttrs::MetricsAttrs(LossFunction _loss_type, measure_mean_absolute_error = true; continue; default: - throw mk_runtime_error(fmt::format( - "Initializing MetricsAttrs with unrecogonized metrics type {}", m)); + PANIC("Initializing MetricsAttrs with unrecognized metrics type {}", m); } } } diff --git a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h similarity index 79% rename from lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h rename to lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h index cc82291f6a..3620ff87cb 100644 --- a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FWD_BWD_OP_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FWD_BWD_OP_TASK_IMPL_FUNCTION_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/task-spec/include/task-spec/generic_task_impl_function.h similarity index 80% rename from lib/local-execution/include/local-execution/generic_task_impl_function.h rename to lib/task-spec/include/task-spec/generic_task_impl_function.h index 9ce22ecf54..b02f4d6beb 100644 --- a/lib/local-execution/include/local-execution/generic_task_impl_function.h +++ b/lib/task-spec/include/task-spec/generic_task_impl_function.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/task-spec/include/task-spec/init_op_task_impl_function.h similarity index 81% rename from lib/local-execution/include/local-execution/init_op_task_impl_function.h rename to lib/task-spec/include/task-spec/init_op_task_impl_function.h index 0481e31a5f..f98e972df8 100644 --- a/lib/local-execution/include/local-execution/init_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/init_op_task_impl_function.h @@ -1,7 +1,7 @@ -#ifndef 
_FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/task-spec/include/task-spec/itask_argument_accessor.h similarity index 82% rename from lib/local-execution/include/local-execution/itask_argument_accessor.h rename to lib/task-spec/include/task-spec/itask_argument_accessor.h index 24b3b3a37f..1424b09b84 100644 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ b/lib/task-spec/include/task-spec/itask_argument_accessor.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "local-execution/privilege_tensor_accessor.h" +#include "task-spec/privilege_tensor_accessor.h" #include "task-spec/concrete_arg.h" #include "task-spec/op_task_signature.h" #include "task-spec/tensor_type.dtg.h" diff --git a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml b/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml index 0d0d428a1b..405385069f 100644 --- a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml @@ -7,7 +7,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h" +] [[fields]] name = "raw_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/task-spec/include/task-spec/ops/attention.h similarity index 79% rename from lib/local-execution/include/local-execution/ops/attention.h rename to lib/task-spec/include/task-spec/ops/attention.h index bf5385f609..9b0179eeac 100644 --- a/lib/local-execution/include/local-execution/ops/attention.h +++ b/lib/task-spec/include/task-spec/ops/attention.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_ATTENTION_H -#define _FLEXFLOW_ATTENTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/attention.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/batch_matmul.h b/lib/task-spec/include/task-spec/ops/batch_matmul.h similarity index 75% rename from lib/local-execution/include/local-execution/ops/batch_matmul.h rename to lib/task-spec/include/task-spec/ops/batch_matmul.h index 64d220ab66..e0dc01d3f1 100644 --- a/lib/local-execution/include/local-execution/ops/batch_matmul.h +++ b/lib/task-spec/include/task-spec/ops/batch_matmul.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_BATCH_MATMUL_H -#define _FLEXFLOW_BATCH_MATMUL_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H -#include "local-execution/task_impl_function.dtg.h" +#include 
"task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/include/local-execution/ops/batch_norm.h b/lib/task-spec/include/task-spec/ops/batch_norm.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/batch_norm.h rename to lib/task-spec/include/task-spec/ops/batch_norm.h index 85a7190ce1..081b60318f 100644 --- a/lib/local-execution/include/local-execution/ops/batch_norm.h +++ b/lib/task-spec/include/task-spec/ops/batch_norm.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/cast.h b/lib/task-spec/include/task-spec/ops/cast.h similarity index 95% rename from lib/local-execution/include/local-execution/ops/cast.h rename to lib/task-spec/include/task-spec/ops/cast.h index 6a27ad267a..990624b0e3 100644 --- a/lib/local-execution/include/local-execution/ops/cast.h +++ b/lib/task-spec/include/task-spec/ops/cast.h @@ -15,7 +15,7 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/combine.h b/lib/task-spec/include/task-spec/ops/combine.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/combine.h rename to lib/task-spec/include/task-spec/ops/combine.h index 00e9cbed2c..be16379f36 100644 --- a/lib/local-execution/include/local-execution/ops/combine.h +++ b/lib/task-spec/include/task-spec/ops/combine.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/combine_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/concat.h b/lib/task-spec/include/task-spec/ops/concat.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/concat.h rename to lib/task-spec/include/task-spec/ops/concat.h index c46164e417..6c7adf76ea 100644 --- a/lib/local-execution/include/local-execution/ops/concat.h +++ b/lib/task-spec/include/task-spec/ops/concat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/concat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/conv_2d.h b/lib/task-spec/include/task-spec/ops/conv_2d.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/conv_2d.h rename to lib/task-spec/include/task-spec/ops/conv_2d.h index f3bb34ffeb..b7fda64961 100644 --- a/lib/local-execution/include/local-execution/ops/conv_2d.h +++ b/lib/task-spec/include/task-spec/ops/conv_2d.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/conv_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git 
a/lib/local-execution/include/local-execution/ops/dropout.h b/lib/task-spec/include/task-spec/ops/dropout.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/dropout.h rename to lib/task-spec/include/task-spec/ops/dropout.h index bd7b426c6b..1801b63123 100644 --- a/lib/local-execution/include/local-execution/ops/dropout.h +++ b/lib/task-spec/include/task-spec/ops/dropout.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/dropout_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" diff --git a/lib/local-execution/include/local-execution/ops/element_binary.h b/lib/task-spec/include/task-spec/ops/element_binary.h similarity index 73% rename from lib/local-execution/include/local-execution/ops/element_binary.h rename to lib/task-spec/include/task-spec/ops/element_binary.h index 4e0bb46e47..57af54522d 100644 --- a/lib/local-execution/include/local-execution/ops/element_binary.h +++ b/lib/task-spec/include/task-spec/ops/element_binary.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_ELEMENT_BINARY_H -#define _FLEXFLOW_ELEMENT_BINARY_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H -#include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_impl_function.dtg.h" +#include "task-spec/task_signature_impl.h" #include "op-attrs/ops/element_binary_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/element_unary.h b/lib/task-spec/include/task-spec/ops/element_unary.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/element_unary.h rename to lib/task-spec/include/task-spec/ops/element_unary.h index 9900668d6c..f6dcd41455 100644 --- a/lib/local-execution/include/local-execution/ops/element_unary.h +++ b/lib/task-spec/include/task-spec/ops/element_unary.h @@ -1,7 +1,7 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/element_unary_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/embedding.h b/lib/task-spec/include/task-spec/ops/embedding.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/embedding.h rename to lib/task-spec/include/task-spec/ops/embedding.h index b998aef53e..3a80d38398 100644 --- a/lib/local-execution/include/local-execution/ops/embedding.h +++ b/lib/task-spec/include/task-spec/ops/embedding.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/embedding_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/flat.h b/lib/task-spec/include/task-spec/ops/flat.h similarity index 90% rename from lib/local-execution/include/local-execution/ops/flat.h rename to lib/task-spec/include/task-spec/ops/flat.h index 95afb98340..6ac72ccd6b 100644 --- a/lib/local-execution/include/local-execution/ops/flat.h +++ b/lib/task-spec/include/task-spec/ops/flat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H -#include 
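Alongside the file moves, the header hunks above also normalize include guards from short ad-hoc names (e.g. _FLEXFLOW_BATCH_MATMUL_H) to a full-path convention. For a hypothetical new header under lib/task-spec, the pattern would be:

    // lib/task-spec/include/task-spec/ops/my_new_op.h (illustrative path)
    #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_MY_NEW_OP_H
    #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_MY_NEW_OP_H
    // ... declarations ...
    #endif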
"local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/flat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/gather.h b/lib/task-spec/include/task-spec/ops/gather.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/gather.h rename to lib/task-spec/include/task-spec/ops/gather.h index 5569a94728..c5ccc4ccdb 100644 --- a/lib/local-execution/include/local-execution/ops/gather.h +++ b/lib/task-spec/include/task-spec/ops/gather.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/gather_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/input.h b/lib/task-spec/include/task-spec/ops/input.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/input.h rename to lib/task-spec/include/task-spec/ops/input.h diff --git a/lib/local-execution/include/local-execution/ops/layer_norm.h b/lib/task-spec/include/task-spec/ops/layer_norm.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/layer_norm.h rename to lib/task-spec/include/task-spec/ops/layer_norm.h index e4a15caac2..81af0c360f 100644 --- a/lib/local-execution/include/local-execution/ops/layer_norm.h +++ b/lib/task-spec/include/task-spec/ops/layer_norm.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/layer_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/linear.h b/lib/task-spec/include/task-spec/ops/linear.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/linear.h rename to lib/task-spec/include/task-spec/ops/linear.h index d58d876865..69197fd627 100644 --- a/lib/local-execution/include/local-execution/ops/linear.h +++ b/lib/task-spec/include/task-spec/ops/linear.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/linear_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/noop.h b/lib/task-spec/include/task-spec/ops/noop.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/noop.h rename to lib/task-spec/include/task-spec/ops/noop.h diff --git a/lib/local-execution/include/local-execution/ops/parallel_op.h b/lib/task-spec/include/task-spec/ops/parallel_op.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/parallel_op.h rename to lib/task-spec/include/task-spec/ops/parallel_op.h diff --git a/lib/local-execution/include/local-execution/ops/pool_2d.h b/lib/task-spec/include/task-spec/ops/pool_2d.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/pool_2d.h rename to lib/task-spec/include/task-spec/ops/pool_2d.h index 7d0ec44bd7..a3601e8800 100644 --- a/lib/local-execution/include/local-execution/ops/pool_2d.h +++ b/lib/task-spec/include/task-spec/ops/pool_2d.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H -#include 
"local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/pool_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reduce.h b/lib/task-spec/include/task-spec/ops/reduce.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/reduce.h rename to lib/task-spec/include/task-spec/ops/reduce.h index 5c6d4be338..e44c0f283f 100644 --- a/lib/local-execution/include/local-execution/ops/reduce.h +++ b/lib/task-spec/include/task-spec/ops/reduce.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduce_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reduction.h b/lib/task-spec/include/task-spec/ops/reduction.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/reduction.h rename to lib/task-spec/include/task-spec/ops/reduction.h index 7475d3aeb4..cba90c37bb 100644 --- a/lib/local-execution/include/local-execution/ops/reduction.h +++ b/lib/task-spec/include/task-spec/ops/reduction.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduction_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/repartition.h b/lib/task-spec/include/task-spec/ops/repartition.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/repartition.h rename to lib/task-spec/include/task-spec/ops/repartition.h index 08ecdafcf2..f43cf13179 100644 --- a/lib/local-execution/include/local-execution/ops/repartition.h +++ b/lib/task-spec/include/task-spec/ops/repartition.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/repartition_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/replicate.h b/lib/task-spec/include/task-spec/ops/replicate.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/replicate.h rename to lib/task-spec/include/task-spec/ops/replicate.h index b827b9c272..0086dad741 100644 --- a/lib/local-execution/include/local-execution/ops/replicate.h +++ b/lib/task-spec/include/task-spec/ops/replicate.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/replicate_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reshape.h b/lib/task-spec/include/task-spec/ops/reshape.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/reshape.h rename to lib/task-spec/include/task-spec/ops/reshape.h index ed7e6e9e31..f192d83b9a 100644 --- a/lib/local-execution/include/local-execution/ops/reshape.h +++ b/lib/task-spec/include/task-spec/ops/reshape.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H -#include "local-execution/task_impl_function.dtg.h" +#include 
"task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reshape_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reverse.h b/lib/task-spec/include/task-spec/ops/reverse.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/reverse.h rename to lib/task-spec/include/task-spec/ops/reverse.h index dd0e89ecad..bb123b63f5 100644 --- a/lib/local-execution/include/local-execution/ops/reverse.h +++ b/lib/task-spec/include/task-spec/ops/reverse.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/softmax.h b/lib/task-spec/include/task-spec/ops/softmax.h similarity index 78% rename from lib/local-execution/include/local-execution/ops/softmax.h rename to lib/task-spec/include/task-spec/ops/softmax.h index 294d948b42..528dd5da0b 100644 --- a/lib/local-execution/include/local-execution/ops/softmax.h +++ b/lib/task-spec/include/task-spec/ops/softmax.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_SOFTMAX_H -#define _FLEXFLOW_SOFTMAX_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/softmax_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/split.h b/lib/task-spec/include/task-spec/ops/split.h similarity index 90% rename from lib/local-execution/include/local-execution/ops/split.h rename to lib/task-spec/include/task-spec/ops/split.h index 49cd7cfc7b..ed92f2925e 100644 --- a/lib/local-execution/include/local-execution/ops/split.h +++ b/lib/task-spec/include/task-spec/ops/split.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/split_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/topk.h b/lib/task-spec/include/task-spec/ops/topk.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/topk.h rename to lib/task-spec/include/task-spec/ops/topk.h index aeded512cd..8afe98d568 100644 --- a/lib/local-execution/include/local-execution/ops/topk.h +++ b/lib/task-spec/include/task-spec/ops/topk.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/topk_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/transpose.h b/lib/task-spec/include/task-spec/ops/transpose.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/transpose.h rename to lib/task-spec/include/task-spec/ops/transpose.h index 2c7b5fb3bc..dec29f4b36 100644 --- a/lib/local-execution/include/local-execution/ops/transpose.h +++ b/lib/task-spec/include/task-spec/ops/transpose.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include 
"op-attrs/ops/transpose_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/weight.h b/lib/task-spec/include/task-spec/ops/weight.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/weight.h rename to lib/task-spec/include/task-spec/ops/weight.h diff --git a/lib/local-execution/include/local-execution/permissions.h b/lib/task-spec/include/task-spec/permissions.h similarity index 90% rename from lib/local-execution/include/local-execution/permissions.h rename to lib/task-spec/include/task-spec/permissions.h index f34969f233..d1ae5fc349 100644 --- a/lib/local-execution/include/local-execution/permissions.h +++ b/lib/task-spec/include/task-spec/permissions.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PERMISSIONS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PERMISSIONS_H #include "utils/exception.h" #include "utils/fmt.h" diff --git a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h b/lib/task-spec/include/task-spec/privilege_tensor_accessor.h similarity index 81% rename from lib/local-execution/include/local-execution/privilege_tensor_accessor.h rename to lib/task-spec/include/task-spec/privilege_tensor_accessor.h index aeae3c2e41..171b0fcd39 100644 --- a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h +++ b/lib/task-spec/include/task-spec/privilege_tensor_accessor.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PRIVILEGE_TENSOR_ACCESSOR_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PRIVILEGE_TENSOR_ACCESSOR_H #include "kernels/accessor.h" -#include "local-execution/permissions.h" +#include "task-spec/permissions.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor.h similarity index 96% rename from lib/local-execution/include/local-execution/task_argument_accessor.h rename to lib/task-spec/include/task-spec/task_argument_accessor.h index 499b5ff7d6..2cac3a5dd8 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/task-spec/include/task-spec/task_argument_accessor.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/itask_argument_accessor.h" +#include "task-spec/itask_argument_accessor.h" #include "task-spec/device_specific.h" #include "task-spec/per_device_op_state.dtg.h" diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/task-spec/include/task-spec/task_impl_function.variant.toml similarity index 72% rename from lib/local-execution/include/local-execution/task_impl_function.variant.toml rename to lib/task-spec/include/task-spec/task_impl_function.variant.toml index 48cab9eb01..74347a3290 100644 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ b/lib/task-spec/include/task-spec/task_impl_function.variant.toml @@ -8,9 +8,9 @@ features = [ ] includes = [ - 
"local-execution/init_op_task_impl_function.h", - "local-execution/fwd_bwd_op_task_impl_function.h", - "local-execution/generic_task_impl_function.h", + "task-spec/init_op_task_impl_function.h", + "task-spec/fwd_bwd_op_task_impl_function.h", + "task-spec/generic_task_impl_function.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/task_signature_impl.h b/lib/task-spec/include/task-spec/task_signature_impl.h similarity index 71% rename from lib/local-execution/include/local-execution/task_signature_impl.h rename to lib/task-spec/include/task-spec/task_signature_impl.h index 613a173f25..ee093c7d23 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.h +++ b/lib/task-spec/include/task-spec/task_signature_impl.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H -#include "local-execution/task_signature_impl.dtg.h" +#include "task-spec/task_signature_impl.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" diff --git a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml b/lib/task-spec/include/task-spec/task_signature_impl.struct.toml similarity index 86% rename from lib/local-execution/include/local-execution/task_signature_impl.struct.toml rename to lib/task-spec/include/task-spec/task_signature_impl.struct.toml index 78064203ec..574f11a084 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml +++ b/lib/task-spec/include/task-spec/task_signature_impl.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "local-execution/task_impl_function.dtg.h", + "task-spec/task_impl_function.dtg.h", "task-spec/op_task_signature.h", ] diff --git a/lib/task-spec/src/concrete_arg.cc b/lib/task-spec/src/task-spec/concrete_arg.cc similarity index 100% rename from lib/task-spec/src/concrete_arg.cc rename to lib/task-spec/src/task-spec/concrete_arg.cc diff --git a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc similarity index 96% rename from lib/local-execution/src/fwd_bwd_op_task_impl_function.cc rename to lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc index 308dbfd3ae..3450b5d268 100644 --- a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc @@ -1,4 +1,4 @@ -#include "local-execution/fwd_bwd_op_task_impl_function.h" +#include "task-spec/fwd_bwd_op_task_impl_function.h" namespace FlexFlow { diff --git a/lib/local-execution/src/generic_task_impl_function.cc b/lib/task-spec/src/task-spec/generic_task_impl_function.cc similarity index 96% rename from lib/local-execution/src/generic_task_impl_function.cc rename to lib/task-spec/src/task-spec/generic_task_impl_function.cc index 87d4db53e6..4abd1ab644 100644 --- a/lib/local-execution/src/generic_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/generic_task_impl_function.cc @@ -1,4 +1,4 @@ -#include "local-execution/generic_task_impl_function.h" +#include "task-spec/generic_task_impl_function.h" namespace FlexFlow { diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc similarity index 96% rename from 
lib/local-execution/src/init_op_task_impl_function.cc rename to lib/task-spec/src/task-spec/init_op_task_impl_function.cc index abe84b828e..4cd55fc488 100644 --- a/lib/local-execution/src/init_op_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc @@ -1,4 +1,4 @@ -#include "local-execution/init_op_task_impl_function.h" +#include "task-spec/init_op_task_impl_function.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/itask_argument_accessor.cc b/lib/task-spec/src/task-spec/itask_argument_accessor.cc new file mode 100644 index 0000000000..c7878b1abc --- /dev/null +++ b/lib/task-spec/src/task-spec/itask_argument_accessor.cc @@ -0,0 +1 @@ +#include "task-spec/itask_argument_accessor.h" diff --git a/lib/task-spec/src/op_arg_ref.cc b/lib/task-spec/src/task-spec/op_arg_ref.cc similarity index 100% rename from lib/task-spec/src/op_arg_ref.cc rename to lib/task-spec/src/task-spec/op_arg_ref.cc diff --git a/lib/task-spec/src/op_arg_spec.cc b/lib/task-spec/src/task-spec/op_arg_spec.cc similarity index 100% rename from lib/task-spec/src/op_arg_spec.cc rename to lib/task-spec/src/task-spec/op_arg_spec.cc diff --git a/lib/task-spec/src/op_task_invocation.cc b/lib/task-spec/src/task-spec/op_task_invocation.cc similarity index 100% rename from lib/task-spec/src/op_task_invocation.cc rename to lib/task-spec/src/task-spec/op_task_invocation.cc diff --git a/lib/task-spec/src/op_task_signature.cc b/lib/task-spec/src/task-spec/op_task_signature.cc similarity index 100% rename from lib/task-spec/src/op_task_signature.cc rename to lib/task-spec/src/task-spec/op_task_signature.cc diff --git a/lib/task-spec/src/op_task_to_task_invocation.cc b/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc similarity index 100% rename from lib/task-spec/src/op_task_to_task_invocation.cc rename to lib/task-spec/src/task-spec/op_task_to_task_invocation.cc diff --git a/lib/task-spec/src/op_tensor_spec.cc b/lib/task-spec/src/task-spec/op_tensor_spec.cc similarity index 100% rename from lib/task-spec/src/op_tensor_spec.cc rename to lib/task-spec/src/task-spec/op_tensor_spec.cc diff --git a/lib/local-execution/src/local-execution/ops/attention.cc b/lib/task-spec/src/task-spec/ops/attention.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/attention.cc rename to lib/task-spec/src/task-spec/ops/attention.cc index a9e6a9fa30..01960803ce 100644 --- a/lib/local-execution/src/local-execution/ops/attention.cc +++ b/lib/task-spec/src/task-spec/ops/attention.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/attention.h" +#include "task-spec/ops/attention.h" #include "kernels/attention_kernels.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" diff --git a/lib/local-execution/src/local-execution/ops/batch_matmul.cc b/lib/task-spec/src/task-spec/ops/batch_matmul.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/batch_matmul.cc rename to lib/task-spec/src/task-spec/ops/batch_matmul.cc index 2cbf1cf20f..371c80d7e2 100644 --- a/lib/local-execution/src/local-execution/ops/batch_matmul.cc +++ b/lib/task-spec/src/task-spec/ops/batch_matmul.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/batch_matmul.h" +#include "task-spec/ops/batch_matmul.h" #include "kernels/batch_matmul_kernels.h" #include "op-attrs/ops/batch_matmul.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/src/local-execution/ops/batch_norm.cc b/lib/task-spec/src/task-spec/ops/batch_norm.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/batch_norm.cc rename to lib/task-spec/src/task-spec/ops/batch_norm.cc index 7ba62bcc59..2aa308dada 100644 --- a/lib/local-execution/src/local-execution/ops/batch_norm.cc +++ b/lib/task-spec/src/task-spec/ops/batch_norm.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/batch_norm.h" +#include "task-spec/ops/batch_norm.h" #include "kernels/batch_norm_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/cast.cc b/lib/task-spec/src/task-spec/ops/cast.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/cast.cc rename to lib/task-spec/src/task-spec/ops/cast.cc index 752317d722..7cf26be95b 100644 --- a/lib/local-execution/src/local-execution/ops/cast.cc +++ b/lib/task-spec/src/task-spec/ops/cast.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/cast.h" +#include "task-spec/ops/cast.h" #include "kernels/cast_kernels.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/src/local-execution/ops/combine.cc b/lib/task-spec/src/task-spec/ops/combine.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/combine.cc rename to lib/task-spec/src/task-spec/ops/combine.cc index 32fab636d3..41c276facb 100644 --- a/lib/local-execution/src/local-execution/ops/combine.cc +++ b/lib/task-spec/src/task-spec/ops/combine.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/combine.h" +#include "task-spec/ops/combine.h" #include "kernels/combine_kernels.h" #include "task-spec/op_task_invocation.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/concat.cc b/lib/task-spec/src/task-spec/ops/concat.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/concat.cc rename to lib/task-spec/src/task-spec/ops/concat.cc index 8531bf77c0..2cb082d1eb 100644 --- a/lib/local-execution/src/local-execution/ops/concat.cc +++ b/lib/task-spec/src/task-spec/ops/concat.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/concat.h" +#include "task-spec/ops/concat.h" #include "kernels/concat_kernels.h" #include "task-spec/op_task_signature.h" #include "task-spec/variadic_tensor_ref.h" diff --git a/lib/local-execution/src/local-execution/ops/conv_2d.cc b/lib/task-spec/src/task-spec/ops/conv_2d.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/conv_2d.cc rename to lib/task-spec/src/task-spec/ops/conv_2d.cc index cc0febff24..47b889c6ce 100644 --- a/lib/local-execution/src/local-execution/ops/conv_2d.cc +++ b/lib/task-spec/src/task-spec/ops/conv_2d.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/conv_2d.h" +#include "task-spec/ops/conv_2d.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/dropout.cc b/lib/task-spec/src/task-spec/ops/dropout.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/dropout.cc rename to lib/task-spec/src/task-spec/ops/dropout.cc index cc09841190..d19ace886b 100644 --- a/lib/local-execution/src/local-execution/ops/dropout.cc +++ b/lib/task-spec/src/task-spec/ops/dropout.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/dropout.h" +#include "task-spec/ops/dropout.h" #include "kernels/dropout_kernels.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/src/local-execution/ops/element_binary.cc b/lib/task-spec/src/task-spec/ops/element_binary.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/element_binary.cc rename to lib/task-spec/src/task-spec/ops/element_binary.cc index ec8ed298d0..5356901423 100644 --- a/lib/local-execution/src/local-execution/ops/element_binary.cc +++ b/lib/task-spec/src/task-spec/ops/element_binary.cc @@ -1,6 +1,6 @@ -#include "local-execution/ops/element_binary.h" +#include "task-spec/ops/element_binary.h" #include "kernels/element_binary_kernels.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/element_unary.cc b/lib/task-spec/src/task-spec/ops/element_unary.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/element_unary.cc rename to lib/task-spec/src/task-spec/ops/element_unary.cc index 4cf54e5b38..1f4e651251 100644 --- a/lib/local-execution/src/local-execution/ops/element_unary.cc +++ b/lib/task-spec/src/task-spec/ops/element_unary.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/element_unary.h" +#include "task-spec/ops/element_unary.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/flat.cc b/lib/task-spec/src/task-spec/ops/flat.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/flat.cc rename to lib/task-spec/src/task-spec/ops/flat.cc index 414a56769d..1bc0999e1a 100644 --- a/lib/local-execution/src/local-execution/ops/flat.cc +++ b/lib/task-spec/src/task-spec/ops/flat.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/flat.h" +#include "task-spec/ops/flat.h" #include "kernels/flat_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/gather.cc b/lib/task-spec/src/task-spec/ops/gather.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/gather.cc rename to lib/task-spec/src/task-spec/ops/gather.cc index 
7e4b99a557..a0bfaddc0f 100644 --- a/lib/local-execution/src/local-execution/ops/gather.cc +++ b/lib/task-spec/src/task-spec/ops/gather.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/gather.h" +#include "task-spec/ops/gather.h" #include "kernels/gather_kernels.h" #include "utils/nonnegative_int/nonnegative_range.h" #include diff --git a/lib/local-execution/src/local-execution/ops/input.cc b/lib/task-spec/src/task-spec/ops/input.cc similarity index 76% rename from lib/local-execution/src/local-execution/ops/input.cc rename to lib/task-spec/src/task-spec/ops/input.cc index d7a3888220..53caadfe68 100644 --- a/lib/local-execution/src/local-execution/ops/input.cc +++ b/lib/task-spec/src/task-spec/ops/input.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/input.h" +#include "task-spec/ops/input.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/layer_norm.cc rename to lib/task-spec/src/task-spec/ops/layer_norm.cc index d2fc930375..c2f16d7eda 100644 --- a/lib/local-execution/src/local-execution/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/layer_norm.h" +#include "task-spec/ops/layer_norm.h" #include "kernels/layer_norm_kernels.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/linear.cc rename to lib/task-spec/src/task-spec/ops/linear.cc index 96fcc85ca1..8d4a81c5c4 100644 --- a/lib/local-execution/src/local-execution/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -1,6 +1,6 @@ -#include "local-execution/ops/linear.h" +#include "task-spec/ops/linear.h" #include "kernels/linear_kernels.h" -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "op-attrs/ff_dim_t.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/noop.cc b/lib/task-spec/src/task-spec/ops/noop.cc similarity index 95% rename from lib/local-execution/src/local-execution/ops/noop.cc rename to lib/task-spec/src/task-spec/ops/noop.cc index 7357806880..4d69b8fd5f 100644 --- a/lib/local-execution/src/local-execution/ops/noop.cc +++ b/lib/task-spec/src/task-spec/ops/noop.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/noop.h" +#include "task-spec/ops/noop.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/pool_2d.cc b/lib/task-spec/src/task-spec/ops/pool_2d.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/pool_2d.cc rename to lib/task-spec/src/task-spec/ops/pool_2d.cc index 6db1cf9dc3..d7064ca04d 100644 --- a/lib/local-execution/src/local-execution/ops/pool_2d.cc +++ b/lib/task-spec/src/task-spec/ops/pool_2d.cc @@ -1,6 +1,5 @@ -#include "local-execution/ops/pool_2d.h" +#include "task-spec/ops/pool_2d.h" #include "kernels/pool_2d_kernels.h" - #include "op-attrs/ops/pool_2d.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/reduce.cc b/lib/task-spec/src/task-spec/ops/reduce.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/reduce.cc rename to lib/task-spec/src/task-spec/ops/reduce.cc index bc4b5343c2..ccc1285aaa 100644 --- a/lib/local-execution/src/local-execution/ops/reduce.cc +++ b/lib/task-spec/src/task-spec/ops/reduce.cc @@ -1,6 +1,5 @@ -#include "local-execution/ops/reduce.h" +#include "task-spec/ops/reduce.h" #include "kernels/reduce_kernels.h" - #include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/type_traits_core.h" diff --git a/lib/local-execution/src/local-execution/ops/reduction.cc b/lib/task-spec/src/task-spec/ops/reduction.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/reduction.cc rename to lib/task-spec/src/task-spec/ops/reduction.cc index 340f695ffb..96e2c6c506 100644 --- a/lib/local-execution/src/local-execution/ops/reduction.cc +++ b/lib/task-spec/src/task-spec/ops/reduction.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/reduction.h" +#include "task-spec/ops/reduction.h" #include "kernels/reduction_kernels.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/repartition.cc b/lib/task-spec/src/task-spec/ops/repartition.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/repartition.cc rename to lib/task-spec/src/task-spec/ops/repartition.cc index 942f2d8fee..cfc45dede7 100644 --- a/lib/local-execution/src/local-execution/ops/repartition.cc +++ b/lib/task-spec/src/task-spec/ops/repartition.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/repartition.h" +#include "task-spec/ops/repartition.h" #include "kernels/partition_kernels.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/replicate.cc b/lib/task-spec/src/task-spec/ops/replicate.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/replicate.cc rename to lib/task-spec/src/task-spec/ops/replicate.cc index 13a4fd1635..0ed5d98708 100644 --- a/lib/local-execution/src/local-execution/ops/replicate.cc +++ b/lib/task-spec/src/task-spec/ops/replicate.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/replicate.h" +#include "task-spec/ops/replicate.h" #include "kernels/replicate_kernels.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/reshape.cc b/lib/task-spec/src/task-spec/ops/reshape.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/reshape.cc rename to lib/task-spec/src/task-spec/ops/reshape.cc index 294e207f00..0b43f3e31f 100644 --- a/lib/local-execution/src/local-execution/ops/reshape.cc +++ b/lib/task-spec/src/task-spec/ops/reshape.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/reshape.h" +#include "task-spec/ops/reshape.h" #include "kernels/reshape_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/reverse.cc b/lib/task-spec/src/task-spec/ops/reverse.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/reverse.cc rename to lib/task-spec/src/task-spec/ops/reverse.cc index f3178e86ba..41739d086e 100644 --- a/lib/local-execution/src/local-execution/ops/reverse.cc +++ b/lib/task-spec/src/task-spec/ops/reverse.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/reverse.h" +#include "task-spec/ops/reverse.h" #include "kernels/accessor.h" #include "kernels/reverse_kernels.h" #include "utils/nonnegative_int/nonnegative_range.h" diff --git a/lib/local-execution/src/local-execution/ops/softmax.cc b/lib/task-spec/src/task-spec/ops/softmax.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/softmax.cc rename to lib/task-spec/src/task-spec/ops/softmax.cc index 4dedff6e18..d7b27fd884 100644 --- a/lib/local-execution/src/local-execution/ops/softmax.cc +++ b/lib/task-spec/src/task-spec/ops/softmax.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/softmax.h" +#include "task-spec/ops/softmax.h" #include "kernels/softmax_kernels.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/split.cc rename to lib/task-spec/src/task-spec/ops/split.cc index 5661fa7381..a14f6a587d 100644 --- a/lib/local-execution/src/local-execution/ops/split.cc +++ b/lib/task-spec/src/task-spec/ops/split.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/split.h" +#include "task-spec/ops/split.h" #include "kernels/array_shape.h" #include "kernels/split_kernels.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/topk.cc rename to lib/task-spec/src/task-spec/ops/topk.cc index fd895605a1..11f1fffa41 100644 --- a/lib/local-execution/src/local-execution/ops/topk.cc +++ b/lib/task-spec/src/task-spec/ops/topk.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/topk.h" +#include "task-spec/ops/topk.h" #include "kernels/topk_kernels.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/task-spec/src/task-spec/ops/transpose.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/transpose.cc rename to lib/task-spec/src/task-spec/ops/transpose.cc index c3de935d7c..b6a69b0ed7 100644 --- a/lib/local-execution/src/local-execution/ops/transpose.cc +++ b/lib/task-spec/src/task-spec/ops/transpose.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/transpose.h" +#include "task-spec/ops/transpose.h" #include "kernels/transpose_kernels.h" #include "op-attrs/ops/transpose.h" #include "utils/integer_conversions.h" diff --git a/lib/local-execution/src/local-execution/ops/weight.cc b/lib/task-spec/src/task-spec/ops/weight.cc similarity index 76% rename from lib/local-execution/src/local-execution/ops/weight.cc rename to lib/task-spec/src/task-spec/ops/weight.cc index f96c104f33..08c9be26e9 100644 --- a/lib/local-execution/src/local-execution/ops/weight.cc +++ b/lib/task-spec/src/task-spec/ops/weight.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/weight.h" +#include "task-spec/ops/weight.h" namespace FlexFlow { diff --git a/lib/task-spec/src/per_device_op_state.cc b/lib/task-spec/src/task-spec/per_device_op_state.cc similarity index 100% rename from lib/task-spec/src/per_device_op_state.cc rename to lib/task-spec/src/task-spec/per_device_op_state.cc diff --git a/lib/local-execution/src/permissions.cc b/lib/task-spec/src/task-spec/permissions.cc similarity index 97% rename from lib/local-execution/src/permissions.cc rename to lib/task-spec/src/task-spec/permissions.cc index 2286215987..8b5edb4df1 100644 --- a/lib/local-execution/src/permissions.cc +++ b/lib/task-spec/src/task-spec/permissions.cc @@ -1,4 +1,4 @@ -#include "local-execution/permissions.h" +#include "task-spec/permissions.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/privilege_tensor_accessor.cc b/lib/task-spec/src/task-spec/privilege_tensor_accessor.cc new file mode 100644 index 0000000000..a0c55b4dad --- /dev/null +++ b/lib/task-spec/src/task-spec/privilege_tensor_accessor.cc @@ -0,0 +1 @@ +#include "task-spec/privilege_tensor_accessor.h" diff --git a/lib/task-spec/src/runtime_arg_ref.cc b/lib/task-spec/src/task-spec/runtime_arg_ref.cc similarity index 100% rename from lib/task-spec/src/runtime_arg_ref.cc rename to lib/task-spec/src/task-spec/runtime_arg_ref.cc diff --git a/lib/task-spec/src/task_arg_spec.cc b/lib/task-spec/src/task-spec/task_arg_spec.cc similarity index 100% rename from lib/task-spec/src/task_arg_spec.cc rename to lib/task-spec/src/task-spec/task_arg_spec.cc diff --git a/lib/task-spec/src/task-spec/task_argument_accessor.cc b/lib/task-spec/src/task-spec/task_argument_accessor.cc new file mode 100644 index 0000000000..cee9fc0708 --- /dev/null +++ b/lib/task-spec/src/task-spec/task_argument_accessor.cc @@ -0,0 +1 @@ +#include "task-spec/task_argument_accessor.h" diff --git a/lib/task-spec/src/task_invocation.cc b/lib/task-spec/src/task-spec/task_invocation.cc similarity index 100% rename from lib/task-spec/src/task_invocation.cc rename to lib/task-spec/src/task-spec/task_invocation.cc diff --git a/lib/task-spec/src/task_signature.cc b/lib/task-spec/src/task-spec/task_signature.cc similarity index 100% rename from lib/task-spec/src/task_signature.cc rename to 
lib/task-spec/src/task-spec/task_signature.cc diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/task-spec/src/task-spec/task_signature_impl.cc similarity index 93% rename from lib/local-execution/src/task_signature_impl.cc rename to lib/task-spec/src/task-spec/task_signature_impl.cc index 9031d2a015..7995c0af0b 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/task-spec/src/task-spec/task_signature_impl.cc @@ -1,33 +1,33 @@ -#include "local-execution/task_signature_impl.h" -#include "local-execution/ops/attention.h" -#include "local-execution/ops/batch_matmul.h" -#include "local-execution/ops/batch_norm.h" -#include "local-execution/ops/cast.h" -#include "local-execution/ops/combine.h" -#include "local-execution/ops/concat.h" -#include "local-execution/ops/conv_2d.h" -#include "local-execution/ops/dropout.h" -#include "local-execution/ops/element_binary.h" -#include "local-execution/ops/element_unary.h" -#include "local-execution/ops/embedding.h" -#include "local-execution/ops/flat.h" -#include "local-execution/ops/gather.h" -#include "local-execution/ops/input.h" -#include "local-execution/ops/layer_norm.h" -#include "local-execution/ops/linear.h" -#include "local-execution/ops/noop.h" -#include "local-execution/ops/pool_2d.h" -#include "local-execution/ops/reduce.h" -#include "local-execution/ops/reduction.h" -#include "local-execution/ops/repartition.h" -#include "local-execution/ops/replicate.h" -#include "local-execution/ops/reshape.h" -#include "local-execution/ops/reverse.h" -#include "local-execution/ops/softmax.h" -#include "local-execution/ops/split.h" -#include "local-execution/ops/topk.h" -#include "local-execution/ops/transpose.h" -#include "local-execution/ops/weight.h" +#include "task-spec/task_signature_impl.h" +#include "task-spec/ops/attention.h" +#include "task-spec/ops/batch_matmul.h" +#include "task-spec/ops/batch_norm.h" +#include "task-spec/ops/cast.h" +#include "task-spec/ops/combine.h" +#include "task-spec/ops/concat.h" +#include "task-spec/ops/conv_2d.h" +#include "task-spec/ops/dropout.h" +#include "task-spec/ops/element_binary.h" +#include "task-spec/ops/element_unary.h" +#include "task-spec/ops/embedding.h" +#include "task-spec/ops/flat.h" +#include "task-spec/ops/gather.h" +#include "task-spec/ops/input.h" +#include "task-spec/ops/layer_norm.h" +#include "task-spec/ops/linear.h" +#include "task-spec/ops/noop.h" +#include "task-spec/ops/pool_2d.h" +#include "task-spec/ops/reduce.h" +#include "task-spec/ops/reduction.h" +#include "task-spec/ops/repartition.h" +#include "task-spec/ops/replicate.h" +#include "task-spec/ops/reshape.h" +#include "task-spec/ops/reverse.h" +#include "task-spec/ops/softmax.h" +#include "task-spec/ops/split.h" +#include "task-spec/ops/topk.h" +#include "task-spec/ops/transpose.h" +#include "task-spec/ops/weight.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/task-spec/src/variadic_tensor_ref.cc b/lib/task-spec/src/task-spec/variadic_tensor_ref.cc similarity index 100% rename from lib/task-spec/src/variadic_tensor_ref.cc rename to lib/task-spec/src/task-spec/variadic_tensor_ref.cc diff --git a/lib/task-spec/test/CMakeLists.txt b/lib/task-spec/test/CMakeLists.txt new file mode 100644 index 0000000000..87abf10401 --- /dev/null +++ b/lib/task-spec/test/CMakeLists.txt @@ -0,0 +1,14 @@ +ff_add_test_executable( + NAME + task-spec-tests + SRC_PATTERNS + src/*.cc + PRIVATE_INCLUDE + src/ + DEPS + doctest + utils-test-common + local-execution + kernels + op-attrs +) diff --git 
a/lib/task-spec/test/src/task-spec/arg_ref.cc b/lib/task-spec/test/src/task-spec/arg_ref.cc new file mode 100644 index 0000000000..e1c5a9bd8d --- /dev/null +++ b/lib/task-spec/test/src/task-spec/arg_ref.cc @@ -0,0 +1,33 @@ +#include +#include "task-spec/arg_ref.h" +#include + +using namespace ::FlexFlow; + +enum class ExampleLabelType { + STRING, +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ArgRefSpec::holds") { + CHECK_MESSAGE(false, "TODO: ArgRefSpec"); + + ArgRefSpec arg_ref_spec = ArgRefSpec::create( + ArgRef{ExampleLabelType::STRING} + ); + + SUBCASE("returns true if the type matches the ArgRef type") { + bool result = arg_ref_spec.holds(); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("returns false otherwise") { + bool result = arg_ref_spec.holds(); + bool correct = false; + + CHECK(result == correct); + } + } +} From 292c61c754c85d2c310fe06b56b0716e467f1d2a Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 14 May 2025 18:13:26 -0700 Subject: [PATCH 71/91] Sync changes with Reyna --- lib/kernels/include/kernels/array_coord.h | 13 ++ .../kernels/compare_tensor_accessors.h | 35 +++ .../kernels/create_accessor_with_contents.h | 214 ++++++++++++++++++ .../kernels/managed_per_device_ff_handle.h | 7 +- .../include/kernels/map_tensor_accessors.h | 93 ++++++++ .../include/kernels/reduce_tensor_accessors.h | 39 ++++ lib/kernels/src/kernels/array_coord.cc | 20 ++ .../src/kernels/compare_tensor_accessors.cc | 50 ++++ .../kernels/create_accessor_with_contents.cc | 44 ++++ .../src/kernels/map_tensor_accessors.cc | 26 +++ .../test/src/cpu/ops/replicate_kernels.cc | 11 +- .../test/src/cpu/ops/reverse_kernels.cc | 40 ++-- lib/kernels/test/src/internal/test_utils.cc | 192 ---------------- lib/kernels/test/src/internal/test_utils.h | 26 --- lib/kernels/test/src/kernels/array_coord.cc | 44 ++++ .../src/kernels/compare_tensor_accessors.cc | 57 +++++ .../src/kernels/format_accessor_contents.cc | 9 +- lib/kernels/test/src/test_attention_kernel.cc | 5 +- .../test/src/test_batch_matmul_kernel.cc | 5 +- .../test/src/test_batch_norm_kernel.cc | 7 +- lib/kernels/test/src/test_combine_kernel.cc | 7 +- lib/kernels/test/src/test_concat_kernel.cc | 5 +- lib/kernels/test/src/test_dropout.cc | 5 +- lib/kernels/test/src/test_flat_kernel.cc | 5 +- lib/kernels/test/src/test_gather_kernels.cc | 5 +- .../test/src/test_layer_norm_kernels.cc | 5 +- .../test/src/test_managed_ff_stream.cc | 7 +- .../src/test_managed_per_device_ff_handle.cc | 15 +- lib/kernels/test/src/test_partition_kernel.cc | 5 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 5 +- lib/kernels/test/src/test_reduction_kernel.cc | 5 +- lib/kernels/test/src/test_replicate_kernel.cc | 19 +- lib/kernels/test/src/test_reshape_kernel.cc | 5 +- lib/kernels/test/src/test_reverse_kernels.cc | 12 +- lib/kernels/test/src/test_softmax_kernel.cc | 5 +- lib/kernels/test/src/test_split_kernel.cc | 5 +- lib/kernels/test/src/test_transpose_kernel.cc | 5 +- .../local-execution/model_training_instance.h | 2 +- .../src/model_training_instance.cc | 4 +- lib/local-execution/test/src/test_e2e.cc | 34 +-- .../test/src/test_local_cost_estimator.cc | 6 +- .../test/src/test_loss_functions.cc | 5 +- lib/local-execution/test/src/test_update.cc | 5 +- lib/op-attrs/include/op-attrs/datatype.h | 60 +++-- 44 files changed, 846 insertions(+), 327 deletions(-) create mode 100644 lib/kernels/include/kernels/array_coord.h create mode 100644 lib/kernels/include/kernels/compare_tensor_accessors.h create mode 100644 
lib/kernels/include/kernels/create_accessor_with_contents.h
 create mode 100644 lib/kernels/include/kernels/map_tensor_accessors.h
 create mode 100644 lib/kernels/include/kernels/reduce_tensor_accessors.h
 create mode 100644 lib/kernels/src/kernels/array_coord.cc
 create mode 100644 lib/kernels/src/kernels/compare_tensor_accessors.cc
 create mode 100644 lib/kernels/src/kernels/create_accessor_with_contents.cc
 create mode 100644 lib/kernels/src/kernels/map_tensor_accessors.cc
 create mode 100644 lib/kernels/test/src/kernels/array_coord.cc
 create mode 100644 lib/kernels/test/src/kernels/compare_tensor_accessors.cc

diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h
new file mode 100644
index 0000000000..f739a3d707
--- /dev/null
+++ b/lib/kernels/include/kernels/array_coord.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
+
+#include "kernels/array_coord.dtg.h"
+
+namespace FlexFlow {
+
+ArrayCoord array_coord_drop_dims(
+    ArrayCoord const &,
+    std::function<bool(ff_dim_t)> const &should_drop_dim);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/compare_tensor_accessors.h b/lib/kernels/include/kernels/compare_tensor_accessors.h
new file mode 100644
index 0000000000..ee438505fb
--- /dev/null
+++ b/lib/kernels/include/kernels/compare_tensor_accessors.h
@@ -0,0 +1,35 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_COMPARE_TENSOR_ACCESSORS_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_COMPARE_TENSOR_ACCESSORS_H
+
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+
+namespace FlexFlow {
+
+GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h
new file mode 100644
index 0000000000..fc07d432b2
--- /dev/null
+++ b/lib/kernels/include/kernels/create_accessor_with_contents.h
@@ -0,0 +1,214 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CREATE_ACCESSOR_WITH_CONTENTS_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CREATE_ACCESSOR_WITH_CONTENTS_H
+
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+#include "kernels/local_cpu_allocator.h"
+#include "utils/containers/require_all_same1.h"
+
+namespace FlexFlow {
+
+template <typename T>
+GenericTensorAccessorW
+    create_1d_accessor_w_with_contents(std::vector<T> const &contents,
+                                       Allocator &allocator) {
+  nonnegative_int ncols = num_elements(contents);
+  ASSERT(ncols > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{ncols}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int col_idx : nonnegative_range(ncols)) {
+    cpu_accessor.at<type_to_data_type_enum_v<T>>(FFOrdered{col_idx}) =
+        contents.at(col_idx.unwrap_nonnegative());
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+template <typename T>
+GenericTensorAccessorW create_2d_accessor_w_with_contents(
+    std::vector<std::vector<T>> const &contents, Allocator &allocator) {
+  nonnegative_int nrows = num_elements(contents);
+  ASSERT(nrows > 0);
+
+  nonnegative_int ncols = throw_if_unexpected(
+      require_all_same1(transform(contents, [](std::vector<T> const &row) {
+        return num_elements(row);
+      })));
+  ASSERT(ncols > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{nrows, ncols}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int row_idx : nonnegative_range(nrows)) {
+    for (nonnegative_int col_idx : nonnegative_range(ncols)) {
+      cpu_accessor.at<type_to_data_type_enum_v<T>>(FFOrdered{row_idx, col_idx}) =
+          contents.at(row_idx.unwrap_nonnegative())
+              .at(col_idx.unwrap_nonnegative());
+    }
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+template <typename T>
+GenericTensorAccessorW create_3d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<T>>> const &contents,
+    Allocator &allocator) {
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<T>> const &m) {
+        return num_elements(m);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<T>> const &m) {
+        return throw_if_unexpected(
+            require_all_same1(transform(m, [](std::vector<T> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        cpu_accessor.at<type_to_data_type_enum_v<T>>(
+            FFOrdered{dim0_idx, dim1_idx, dim2_idx}) =
+            contents.at(dim0_idx.unwrap_nonnegative())
+                .at(dim1_idx.unwrap_nonnegative())
+                .at(dim2_idx.unwrap_nonnegative());
+      }
+    }
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+template <typename T>
+GenericTensorAccessorW create_4d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<std::vector<T>>>> const &contents,
+    Allocator &allocator) {
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<T>>> const &t) {
+        return num_elements(t);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<T>>> const &m) {
+        return throw_if_unexpected(require_all_same1(
+            transform(m, [](std::vector<std::vector<T>> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<T>>> const &t) {
+        return throw_if_unexpected(require_all_same1(
+            transform(t, [](std::vector<std::vector<T>> const &mat) {
+              return throw_if_unexpected(require_all_same1(
+                  transform(mat, [](std::vector<T> const &vec) {
+                    return num_elements(vec);
+                  })));
+            })));
+      })));
+  ASSERT(dim3_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) {
+          accessor.at<type_to_data_type_enum_v<T>>(
+              FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) =
+              contents.at(dim0_idx.unwrap_nonnegative())
+                  .at(dim1_idx.unwrap_nonnegative())
+                  .at(dim2_idx.unwrap_nonnegative())
+                  .at(dim3_idx.unwrap_nonnegative());
+        }
+      }
+    }
+  }
+
+  return accessor;
+}
+
+template <typename T>
+GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<T> const &contents,
+                                       Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_1d_accessor_w_with_contents(contents, allocator));
+}
+
+template <typename T>
+GenericTensorAccessorR create_2d_accessor_r_with_contents(
+    std::vector<std::vector<T>> const &contents, Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_2d_accessor_w_with_contents(contents, allocator));
+}
+
+template <typename T>
+GenericTensorAccessorR create_3d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<T>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_3d_accessor_w_with_contents(contents, allocator));
+}
+
+template <typename T>
+GenericTensorAccessorR create_4d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<std::vector<T>>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_4d_accessor_w_with_contents(contents, allocator));
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
index 0226b1a76c..d409ec19ad 100644
--- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h
+++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
@@ -33,9 +33,12 @@ struct ManagedPerDeviceFFHandle {
   PerDeviceFFHandle *handle;
 };
 
-ManagedPerDeviceFFHandle initialize_single_gpu_handle();
+ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize,
+                                                      bool allowTensorOpMathConversion);
 
 ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks,
-                                                     int my_rank);
+                                                     int my_rank,
+                                                     size_t workSpaceSize,
+                                                     bool allowTensorOpMathConversion);
 
 } // namespace FlexFlow
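
A minimal usage sketch of the literal-contents constructors just added (editorial, not part of the patch; it assumes the template parameter T as reconstructed above, and the CPU allocator used throughout the kernel tests in this commit):

    // Build a 2x3 int32 accessor from a row-major nested literal.
    // contents[row][col]; every row must have the same length, or the
    // require_all_same1 check in create_2d_accessor_w_with_contents fails.
    Allocator cpu_allocator = create_local_cpu_memory_allocator();
    GenericTensorAccessorR m = create_2d_accessor_r_with_contents<int32_t>(
        {
            {1, 2, 3},
            {4, 5, 6},
        },
        cpu_allocator);
    // m now has shape 2x3 and data type DataType::INT32; the contents were
    // staged through a CPU allocation and copied into the target allocator.
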
+#include "kernels/allocation.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/require_same.h" +#include "utils/containers/require_all_same1.h" + +namespace FlexFlow { + +template +struct CPUMapTensorAccessor { + template + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + F &&f) { + ArrayShape shape = require_same(input.shape, output.shape); + + ASSERT(input.device_type == DeviceType::CPU); + ASSERT(output.device_type == DeviceType::CPU); + + for (ArrayCoord const &coord : get_array_coord_set(shape)) { + output.at(coord.ff_ordered) + = f(input.at
(coord.ff_ordered)); + } + } +}; + +template > +GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, + Allocator &output_allocator, + F &&f) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(input.shape, type_to_data_type_enum_v)); + + DataTypeDispatch1{}(input.data_type, input_cpu, output_cpu, f); + + return copy_tensor_accessor_w(output_cpu, output_allocator); +} + +template +struct CPUMapTensorAccessors2 { + template > + void operator()(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW &output, + F &&f) { + + ArrayShape shape = throw_if_unexpected(require_all_same1(std::vector{ + lhs.shape, + rhs.shape, + output.shape, + })); + + ASSERT(lhs.device_type == DeviceType::CPU); + ASSERT(rhs.device_type == DeviceType::CPU); + ASSERT(output.device_type == DeviceType::CPU); + + for (ArrayCoord const &coord : get_array_coord_set(shape)) { + output.at>(coord.ff_ordered) + = f(lhs.at
(coord.ff_ordered), rhs.at
+    }
+  }
+};
+
+template <typename F, typename Out = std::invoke_result_t<F, float, float>>
+GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs,
+                                             GenericTensorAccessorR const &rhs,
+                                             Allocator &output_allocator,
+                                             F &&f) {
+  ArrayShape shape = require_same(lhs.shape, rhs.shape);
+  DataType input_data_type = require_same(lhs.data_type, rhs.data_type);
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR lhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator);
+  GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator);
+  DataType output_data_type = type_to_data_type_enum_v<Out>;
+  GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type));
+
+  DataTypeDispatch1<CPUMapTensorAccessors2>{}(input_data_type, lhs_cpu, rhs_cpu, output_cpu, f);
+
+  return copy_tensor_accessor_w(output_cpu, output_allocator);
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/reduce_tensor_accessors.h b/lib/kernels/include/kernels/reduce_tensor_accessors.h
new file mode 100644
index 0000000000..c80c41778f
--- /dev/null
+++ b/lib/kernels/include/kernels/reduce_tensor_accessors.h
@@ -0,0 +1,39 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H
+
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+
+namespace FlexFlow {
+
+template <DataType DT>
+struct CPUReduceTensorAccessorInDims {
+  template <typename F>
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output,
+                  std::unordered_set<ff_dim_t> const &dims_to_reduce,
+                  F &&f) {
+
+    ASSERT(input.device_type == DeviceType::CPU);
+    ASSERT(output.device_type == DeviceType::CPU);
+
+    for (ArrayCoord const &coord : get_array_coord_set(input.shape)) {
+      ArrayCoord out_coord = array_coord_drop_dims(
+          coord, [&](ff_dim_t dim) { return contains(dims_to_reduce, dim); });
+      output.at<DT>(out_coord.ff_ordered) =
+          f(output.at<DT>(out_coord.ff_ordered), input.at<DT>(coord.ff_ordered));
+    }
+  }
+};
+
+template <typename F>
+GenericTensorAccessorW reduce_tensor_accessor_in_dims(std::unordered_set<ff_dim_t> const &dims,
+                                                      F &&f) {
+  NOT_IMPLEMENTED();
+}
+
+GenericTensorAccessorW reduce_tensor_accessor_all(GenericTensorAccessorR const &input,
+                                                  Allocator &allocator);
+
+} // namespace FlexFlow
+
+#endif
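
Taken together, the header above gives an elementwise map over one or two accessors, with the loop always running on CPU and the data copied to and from the device as needed. A usage sketch (editorial, not part of the patch; the fixed float result type mirrors the F1/F2 instantiation helpers in map_tensor_accessors.cc further down):

    // Double every element of an int32 accessor. The callable's result type
    // (float here, regardless of input element type) determines the output
    // accessor's DataType.
    GenericTensorAccessorR input =
        create_1d_accessor_r_with_contents<int32_t>({1, 3, 2}, cpu_allocator);
    GenericTensorAccessorW doubled = map_tensor_accessor(
        input, cpu_allocator,
        [](auto const &x) { return 2.0f * static_cast<float>(x); });
    // doubled now holds {2.0f, 6.0f, 4.0f}.
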
diff --git a/lib/kernels/src/kernels/array_coord.cc b/lib/kernels/src/kernels/array_coord.cc
new file mode 100644
index 0000000000..60bb19351c
--- /dev/null
+++ b/lib/kernels/src/kernels/array_coord.cc
@@ -0,0 +1,20 @@
+#include "kernels/array_coord.h"
+#include "op-attrs/ff_ordered/ff_ordered_of.h"
+#include "op-attrs/ff_ordered/get_idxs.h"
+#include <vector>
+
+namespace FlexFlow {
+
+ArrayCoord array_coord_drop_dims(
+    ArrayCoord const &coord,
+    std::function<bool(ff_dim_t)> const &should_drop_dim) {
+  std::vector<nonnegative_int> result;
+  for (ff_dim_t idx : get_idxs(coord.ff_ordered)) {
+    if (!should_drop_dim(idx)) {
+      result.push_back(coord.ff_ordered.at(idx));
+    }
+  }
+
+  return ArrayCoord{ff_ordered_of(result)};
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/compare_tensor_accessors.cc b/lib/kernels/src/kernels/compare_tensor_accessors.cc
new file mode 100644
index 0000000000..4594fed322
--- /dev/null
+++ b/lib/kernels/src/kernels/compare_tensor_accessors.cc
@@ -0,0 +1,50 @@
+#include "kernels/compare_tensor_accessors.h"
+#include "kernels/map_tensor_accessors.h"
+
+namespace FlexFlow {
+
+GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l < r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l <= r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l > r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l >= r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l == r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l != r; });
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/create_accessor_with_contents.cc b/lib/kernels/src/kernels/create_accessor_with_contents.cc
new file mode 100644
index 0000000000..f8b85baa4a
--- /dev/null
+++ b/lib/kernels/src/kernels/create_accessor_with_contents.cc
@@ -0,0 +1,44 @@
+#include "kernels/create_accessor_with_contents.h"
+
+namespace FlexFlow {
+
+template
+    GenericTensorAccessorW
+    create_1d_accessor_w_with_contents(std::vector<float> const &,
+                                       Allocator &);
+
+template
+    GenericTensorAccessorW create_2d_accessor_w_with_contents(
+        std::vector<std::vector<float>> const &, Allocator &);
+
+template
+    GenericTensorAccessorW create_3d_accessor_w_with_contents(
+        std::vector<std::vector<std::vector<float>>> const &,
+        Allocator &);
+
+template
+    GenericTensorAccessorW create_4d_accessor_w_with_contents(
+        std::vector<std::vector<std::vector<std::vector<float>>>> const &,
+        Allocator &);
+
+template
+    GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<float> const &,
+                                       Allocator &);
+
+template
+    GenericTensorAccessorR create_2d_accessor_r_with_contents(
+        std::vector<std::vector<float>> const &, Allocator &);
+
+template
+    GenericTensorAccessorR create_3d_accessor_r_with_contents(
+        std::vector<std::vector<std::vector<float>>> const &,
+        Allocator &);
+
+template
+    GenericTensorAccessorR create_4d_accessor_r_with_contents(
+        std::vector<std::vector<std::vector<std::vector<float>>>> const &,
+        Allocator &);
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/map_tensor_accessors.cc b/lib/kernels/src/kernels/map_tensor_accessors.cc
new file mode 100644
index 0000000000..619f1cc412
--- /dev/null
+++ b/lib/kernels/src/kernels/map_tensor_accessors.cc
@@ -0,0 +1,26 @@
+#include "kernels/map_tensor_accessors.h"
+
+namespace FlexFlow {
+
+struct F1 {
+  template <typename T>
+  float operator()(T const &t) const { NOT_IMPLEMENTED(); }
+};
+
+template
+GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &,
+                                           Allocator &,
+                                           F1 &&);
+
+struct F2 {
+  template <typename T>
+  float operator()(T const &lhs, T const &rhs) const { NOT_IMPLEMENTED(); }
+};
+
+template
+    GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &,
+                                                 GenericTensorAccessorR const &,
+                                                 Allocator &,
+                                                 F2 &&);
+
+} // namespace FlexFlow
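
The comparison helpers are thin wrappers over map_tensor_accessors2, one per operator. A sketch of the intended call pattern (editorial, not part of the patch; values are made up):

    // Elementwise comparison of two same-shaped accessors; each output
    // element is the bool of (l == r) for the corresponding input elements.
    GenericTensorAccessorR lhs =
        create_1d_accessor_r_with_contents<int32_t>({1, 3, 2}, cpu_allocator);
    GenericTensorAccessorR rhs =
        create_1d_accessor_r_with_contents<int32_t>({1, 2, 2}, cpu_allocator);
    GenericTensorAccessorW eq =
        compare_tensor_accessors_eq(lhs, rhs, cpu_allocator);
    // eq holds {true, false, true}.
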
diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc
index 8630dcd8cd..6c35185524 100644
--- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc
+++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc
@@ -1,4 +1,5 @@
 #include "internal/test_utils.h"
+#include "kernels/create_accessor_with_contents.h"
 #include "kernels/format_accessor_contents.h"
 #include "kernels/replicate_kernels_cpu.h"
 #include "test/utils/doctest/check_kv.h"
@@ -11,11 +12,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
     GenericTensorAccessorR input =
-        create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator);
+        create_1d_accessor_r_with_contents<int32_t>({1, 3, 2}, cpu_allocator);
 
     TensorShape result_shape = TensorShape{
         TensorDims{FFOrdered{3_n}},
-        DataType::FLOAT,
+        DataType::INT32,
     };
     GenericTensorAccessorW result =
         create_zero_filled_accessor_w(result_shape, cpu_allocator);
@@ -32,7 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Replicate::cpu_backward_kernel") {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
-    GenericTensorAccessorR output = create_2d_accessor_r_with_contents(
+    GenericTensorAccessorR output = create_2d_accessor_r_with_contents<int32_t>(
         {
             {1, 2, 3},
             {4, 3, 3},
         },
         cpu_allocator);
 
-    GenericTensorAccessorR correct = create_1d_accessor_r_with_contents(
+    GenericTensorAccessorR correct = create_1d_accessor_r_with_contents<int32_t>(
         {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator);
 
     TensorShape result_shape = TensorShape{
         TensorDims{FFOrdered{3_n}},
-        DataType::FLOAT,
+        DataType::INT32,
     };
     GenericTensorAccessorW result =
         create_zero_filled_accessor_w(result_shape, cpu_allocator);
diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc
index db0016cb0b..8c54f4453b 100644
--- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc
+++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc
@@ -1,7 +1,9 @@
 #include "internal/test_utils.h"
 #include "kernels/format_accessor_contents.h"
 #include "kernels/reverse_kernels_cpu.h"
+#include "kernels/create_accessor_with_contents.h"
 #include <doctest/doctest.h>
+#include "test/utils/doctest/check_kv.h"
 
 using namespace ::FlexFlow;
 
@@ -9,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Reverse::cpu_forward_kernel") {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
-    GenericTensorAccessorR input = create_3d_accessor_r_with_contents(
+    GenericTensorAccessorR input = create_3d_accessor_r_with_contents<int32_t>(
         {
             {
                 {1, 3, 2},
@@ -25,7 +27,7 @@
     GenericTensorAccessorW result = create_zero_filled_accessor_w(
         TensorShape{
             TensorDims{FFOrdered{2_n, 2_n, 3_n}},
-            DataType::FLOAT,
+            DataType::INT32,
         },
         cpu_allocator);
 
@@ -34,7 +36,7 @@
         /*axis=*/ff_dim_t{0_n},
       };
 
-      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents(
+      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents<int32_t>(
          {
              {
                  {3, 3, 6},
@@ -50,8 +52,7 @@
       Kernels::Reverse::cpu_forward_kernel(input, result, attrs);
 
       CHECK_MESSAGE(accessors_are_equal(result, correct),
-                    "result=",
-                    format_accessor_w_contents(result));
+                    check_kv("result", format_accessor_w_contents(result)));
     }
 
     SUBCASE("axis = ff_dim_t{1}") {
@@ -59,7 +60,7 @@
         /*axis=*/ff_dim_t{1_n},
       };
 
-      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents(
+      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents<int32_t>(
          {
              {
                  {4, 2, 1},
@@ -75,8 +76,7 @@
Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } SUBCASE("axis = ff_dim_t{2}") { @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {2, 3, 1}, @@ -100,15 +100,14 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } } TEST_CASE("Reverse::cpu_backward_kernel") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { { {1, 3, 2}, @@ -124,7 +123,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW result = create_zero_filled_accessor_w( TensorShape{ TensorDims{FFOrdered{2_n, 2_n, 3_n}}, - DataType::FLOAT, + DataType::INT32, }, cpu_allocator); @@ -133,7 +132,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {3, 3, 6}, @@ -149,8 +148,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } SUBCASE("axis = ff_dim_t{1}") { @@ -158,7 +156,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{1_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {4, 2, 1}, @@ -174,8 +172,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } SUBCASE("axis = ff_dim_t{2}") { @@ -183,7 +180,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {2, 3, 1}, @@ -199,8 +196,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } } } diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc index 0f34a6aa06..b20ea8ee6b 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -20,198 +20,6 @@ GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, return read_only_accessor_from_write_accessor(accessor); } -GenericTensorAccessorW - create_1d_accessor_w_with_contents(std::vector const &contents, - Allocator &allocator) { - nonnegative_int ncols = num_elements(contents); - ASSERT(ncols > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{ncols}}, - DataType::FLOAT, - }; - - Allocator cpu_allocator = 
create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - - for (nonnegative_int col_idx : nonnegative_range(ncols)) { - cpu_accessor.at(FFOrdered{col_idx}) = - contents.at(col_idx.unwrap_nonnegative()); - } - - GenericTensorAccessorW result = allocator.allocate_tensor(shape); - copy_accessor_data_to_l_from_r( - result, read_only_accessor_from_write_accessor(cpu_accessor)); - - return result; -} - -GenericTensorAccessorW create_2d_accessor_w_with_contents( - std::vector> const &contents, Allocator &allocator) { - nonnegative_int nrows = num_elements(contents); - ASSERT(nrows > 0); - - nonnegative_int ncols = throw_if_unexpected( - require_all_same1(transform(contents, [](std::vector const &row) { - return num_elements(row); - }))); - ASSERT(ncols > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{nrows, ncols}}, - DataType::FLOAT, - }; - - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - - for (nonnegative_int row_idx : nonnegative_range(nrows)) { - for (nonnegative_int col_idx : nonnegative_range(ncols)) { - cpu_accessor.at(FFOrdered{row_idx, col_idx}) = - contents.at(row_idx.unwrap_nonnegative()) - .at(col_idx.unwrap_nonnegative()); - } - } - - GenericTensorAccessorW result = allocator.allocate_tensor(shape); - copy_accessor_data_to_l_from_r( - result, read_only_accessor_from_write_accessor(cpu_accessor)); - - return result; -} - -GenericTensorAccessorW create_3d_accessor_w_with_contents( - std::vector>> const &contents, - Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); - - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( - transform(contents, [](std::vector> const &m) { - return num_elements(m); - }))); - ASSERT(dim1_size > 0); - - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( - transform(contents, [](std::vector> const &m) { - return throw_if_unexpected( - require_all_same1(transform(m, [](std::vector const &vec) { - return num_elements(vec); - }))); - }))); - ASSERT(dim2_size > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, - DataType::FLOAT, - }; - - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { - cpu_accessor.at( - FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = - contents.at(dim0_idx.unwrap_nonnegative()) - .at(dim1_idx.unwrap_nonnegative()) - .at(dim2_idx.unwrap_nonnegative()); - } - } - } - - GenericTensorAccessorW result = allocator.allocate_tensor(shape); - copy_accessor_data_to_l_from_r( - result, read_only_accessor_from_write_accessor(cpu_accessor)); - - return result; -} - -GenericTensorAccessorW create_4d_accessor_w_with_contents( - std::vector>>> const &contents, - Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); - - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( - contents, [](std::vector>> const &t) { - return num_elements(t); - }))); - ASSERT(dim1_size > 0); - - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( - contents, [](std::vector>> const &m) { - return 
throw_if_unexpected(require_all_same1( - transform(m, [](std::vector> const &vec) { - return num_elements(vec); - }))); - }))); - ASSERT(dim2_size > 0); - - nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( - contents, [](std::vector>> const &t) { - return throw_if_unexpected(require_all_same1( - transform(t, [](std::vector> const &mat) { - return throw_if_unexpected(require_all_same1( - transform(mat, [](std::vector const &vec) { - return num_elements(vec); - }))); - }))); - }))); - ASSERT(dim3_size > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, - DataType::FLOAT, - }; - - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { - for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { - accessor.at( - FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = - contents.at(dim0_idx.unwrap_nonnegative()) - .at(dim1_idx.unwrap_nonnegative()) - .at(dim2_idx.unwrap_nonnegative()) - .at(dim3_idx.unwrap_nonnegative()); - } - } - } - } - - return accessor; -} - -GenericTensorAccessorR - create_1d_accessor_r_with_contents(std::vector const &contents, - Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_1d_accessor_w_with_contents(contents, allocator)); -} - -GenericTensorAccessorR create_2d_accessor_r_with_contents( - std::vector> const &contents, Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_2d_accessor_w_with_contents(contents, allocator)); -} - -GenericTensorAccessorR create_3d_accessor_r_with_contents( - std::vector>> const &contents, - Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_3d_accessor_w_with_contents(contents, allocator)); -} - -GenericTensorAccessorR create_4d_accessor_r_with_contents( - std::vector>>> const &contents, - Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_4d_accessor_w_with_contents(contents, allocator)); -} template struct CreateRandomFilledAccessorW { diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h index a4fc9b88c8..9147b667d6 100644 --- a/lib/kernels/test/src/internal/test_utils.h +++ b/lib/kernels/test/src/internal/test_utils.h @@ -29,32 +29,6 @@ GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, Allocator &allocator); -GenericTensorAccessorW - create_1d_accessor_w_with_contents(std::vector const &contents, - Allocator &allocator); -GenericTensorAccessorR - create_1d_accessor_r_with_contents(std::vector const &contents, - Allocator &allocator); - -GenericTensorAccessorW create_2d_accessor_w_with_contents( - std::vector> const &contents, Allocator &allocator); -GenericTensorAccessorR create_2d_accessor_r_with_contents( - std::vector> const &contents, Allocator &allocator); - -GenericTensorAccessorW create_3d_accessor_w_with_contents( - std::vector>> const &contents, - Allocator &allocator); -GenericTensorAccessorR create_3d_accessor_r_with_contents( - std::vector>> const &contents, - Allocator &allocator); - -GenericTensorAccessorW create_4d_accessor_w_with_contents( - std::vector>>> const &contents, - Allocator &allocator); -GenericTensorAccessorR 
create_4d_accessor_r_with_contents(
- std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
- Allocator &allocator);
-
bool contains_non_zero(GenericTensorAccessorR const &accessor);
void fill_with_zeros(GenericTensorAccessorW const &accessor);
diff --git a/lib/kernels/test/src/kernels/array_coord.cc b/lib/kernels/test/src/kernels/array_coord.cc
new file mode 100644
index 0000000000..128b746a87
--- /dev/null
+++ b/lib/kernels/test/src/kernels/array_coord.cc
@@ -0,0 +1,44 @@
+#include <doctest/doctest.h>
+#include "kernels/array_coord.h"
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+ TEST_CASE("array_coord_drop_dims") {
+ ArrayCoord coord = ArrayCoord{
+ FFOrdered<nonnegative_int>{3_n, 5_n, 0_n, 1_n},
+ };
+
+ SUBCASE("removes dims specified to be dropped") {
+ std::function<bool(ff_dim_t)> should_drop_dim
+ = [](ff_dim_t d) { return d.value % 2_n == 0_n; };
+
+ ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim);
+ ArrayCoord correct = ArrayCoord{
+ FFOrdered<nonnegative_int>{5_n, 1_n},
+ };
+
+ CHECK(result == correct);
+ }
+
+ SUBCASE("is identity function if no dimensions are specified to be dropped") {
+ std::function<bool(ff_dim_t)> should_drop_dim
+ = [](ff_dim_t d) { return false; };
+
+ ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim);
+ ArrayCoord correct = coord;
+
+ CHECK(result == correct);
+ }
+
+ SUBCASE("returns empty coord if all dimensions are specified to be dropped") {
+ std::function<bool(ff_dim_t)> should_drop_dim
+ = [](ff_dim_t d) { return true; };
+
+ ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim);
+ ArrayCoord correct = ArrayCoord{FFOrdered<nonnegative_int>{}};
+
+ CHECK(result == correct);
+ }
+ }
+}
diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc
new file mode 100644
index 0000000000..d5124180af
--- /dev/null
+++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc
@@ -0,0 +1,57 @@
+#include "internal/test_utils.h"
+#include <doctest/doctest.h>
+#include "kernels/compare_tensor_accessors.h"
+#include "kernels/create_accessor_with_contents.h"
+#include "kernels/format_accessor_contents.h"
+#include "test/utils/doctest/check_kv.h"
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+ TEST_CASE("compare_tensor_accessors_lt") {
+ Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+ GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents<int32_t>(
+ {
+ {
+ {1, 3, 2},
+ {4, 2, 1},
+ },
+ {
+ {3, 3, 6},
+ {2, 1, 5},
+ },
+ },
+ cpu_allocator);
+
+ GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents<int32_t>(
+ {
+ {
+ {2, 3, 3},
+ {5, 1, 0},
+ },
+ {
+ {1, 5, 4},
+ {2, 1, 5},
+ },
+ },
+ cpu_allocator);
+
+ GenericTensorAccessorW result = compare_tensor_accessors_lt(lhs, rhs, cpu_allocator);
+ GenericTensorAccessorR correct = create_3d_accessor_r_with_contents<bool>(
+ {
+ {
+ {true, false, true},
+ {true, false, false},
+ },
+ {
+ {false, true, false},
+ {false, false, false},
+ },
+ },
+ cpu_allocator);
+
+ CHECK_MESSAGE(accessors_are_equal(result, correct),
+ check_kv("result", format_accessor_w_contents(result)));
+ }
+}
diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc
index 915a84c335..a7f2bed5ba 100644
--- a/lib/kernels/test/src/kernels/format_accessor_contents.cc
+++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc
@@ -1,6 +1,7 @@
#include "kernels/format_accessor_contents.h"
#include "internal/test_utils.h"
#include "kernels/local_cpu_allocator.h"
+#include "kernels/create_accessor_with_contents.h"
#include <doctest/doctest.h>
using namespace ::FlexFlow;
@@ -11,7 +12,7 @@
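// The create_*_accessor_with_contents calls in these hunks now name the
// element type explicitly (e.g. create_1d_accessor_r_with_contents<int32_t>),
// matching the DataType stored in the accessor's TensorShape. The
// implementations moved to "kernels/create_accessor_with_contents.h" and are
// not shown in this patch; a rough sketch of the 1-d writer under that
// assumption, using the type_to_data_type_enum_v trait added later in this
// series and omitting the CPU-staging copy the removed test_utils.cc helpers
// performed:
template <typename T>
GenericTensorAccessorW
    create_1d_accessor_w_with_contents(std::vector<T> const &contents,
                                       Allocator &allocator) {
  nonnegative_int ncols = num_elements(contents);

  TensorShape shape = TensorShape{
      TensorDims{FFOrdered<nonnegative_int>{ncols}},
      type_to_data_type_enum_v<T>, // DataType matching T (INT32 for int32_t, ...)
  };

  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
  for (nonnegative_int col_idx : nonnegative_range(ncols)) {
    // Write each element through the typed accessor interface.
    accessor.at<type_to_data_type_enum_v<T>>(FFOrdered<nonnegative_int>{col_idx}) =
        contents.at(col_idx.unwrap_nonnegative());
  }
  return accessor;
}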
TEST_SUITE(FF_TEST_SUITE) {
SUBCASE("accessor is 1d") {
GenericTensorAccessorR accessor =
- create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator);
+ create_1d_accessor_r_with_contents<int32_t>({1, 2, 3, 2}, cpu_allocator);
std::string correct = "[1 2 3 2]";
@@ -21,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) {
}
SUBCASE("accessor is 2d") {
- GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents(
+ GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents<int32_t>(
{
{1, 2, 3, 5},
{4, 3, 3, 2},
@@ -41,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) {
}
SUBCASE("accessor is 3d") {
- GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents(
+ GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents<int32_t>(
{
{
{1, 2, 3, 6},
@@ -86,7 +87,7 @@ TEST_SUITE(FF_TEST_SUITE) {
SUBCASE("accessor is some other dimension") {
GenericTensorAccessorR accessor =
- create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator);
+ create_4d_accessor_r_with_contents<int32_t>({{{{5}}}}, cpu_allocator);
CHECK_THROWS(format_accessor_r_contents(accessor));
}
diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc
index f89121f5c6..3a0f4ffdc4 100644
--- a/lib/kernels/test/src/test_attention_kernel.cc
+++ b/lib/kernels/test/src/test_attention_kernel.cc
@@ -19,7 +19,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
nonnegative_int kvSeqLength = 20_n;
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc
index 2d98736c38..e10a80b57f 100644
--- a/lib/kernels/test/src/test_batch_matmul_kernel.cc
+++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc
@@ -15,7 +15,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
int seq_length = -1;
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc
index 86c0b7a685..c9a1bf05e6 100644
--- a/lib/kernels/test/src/test_batch_norm_kernel.cc
+++ b/lib/kernels/test/src/test_batch_norm_kernel.cc
@@ -5,7 +5,7 @@
using namespace ::FlexFlow;
-TEST_SUITE(FF_TEST_SUITE) {
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
TEST_CASE("Test BatchNorm Kernel") {
nonnegative_int output_n = 1_n;
nonnegative_int output_c = 10_n;
@@ -13,7 +13,10 @@
nonnegative_int output_w = 10_n;
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc
index 2040dcbd5d..ddcb0d8c49 100644
--- a/lib/kernels/test/src/test_combine_kernel.cc
+++ b/lib/kernels/test/src/test_combine_kernel.cc
@@ -6,9 +6,10 @@ using namespace ::FlexFlow;
TEST_SUITE(FF_CUDA_TEST_SUITE) {
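// Every hunk in this group swaps the raw two-argument ManagedPerDeviceFFHandle
// constructor for the initialize_single_gpu_handle helper. The helper itself
// is not shown in this patch; a plausible sketch, assuming it only fills in
// the single-GPU rank defaults expected by the four-argument constructor
// visible in the test_managed_per_device_ff_handle.cc hunk further below:
ManagedPerDeviceFFHandle
    initialize_single_gpu_handle(size_t workSpaceSize,
                                 bool allowTensorOpMathConversion) {
  return ManagedPerDeviceFFHandle{
      /*num_ranks=*/1,
      /*my_rank=*/0,
      workSpaceSize,
      allowTensorOpMathConversion,
  };
}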
TEST_CASE("Call Combine Forward and Backward Kernels") { - ManagedPerDeviceFFHandle managed_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index c8d74c32ab..20ebb52161 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -6,7 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 3a7ce8fac1..8379e062d5 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -20,7 +20,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index afe3e9793d..dd44b8f50c 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -8,7 +8,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; TensorShape input_shape = TensorShape{ diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1ed64020ec..c387899709 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -6,7 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index b8d9d725cf..eb62784369 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -22,7 +22,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::FLOAT, }; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = 
initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index fb5920adcc..9243601766 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -6,9 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedFFStream") { - ManagedPerDeviceFFHandle managed_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index fc67764cdb..058622e5cb 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,8 +5,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedPerDeviceFFHandle") { - ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle base_handle{ + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, + }; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("constructor") { @@ -22,8 +26,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("move assignment operator") { SUBCASE("move assign to other") { ManagedPerDeviceFFHandle new_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, + }; new_handle = std::move(base_handle); CHECK(&new_handle.raw_handle() == base_handle_ptr); } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index b32368eb29..283b465abc 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -7,7 +7,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 4098456b8d..ceca1d94dd 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -23,7 +23,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc 
b/lib/kernels/test/src/test_reduction_kernel.cc
index 16d8556bb3..b7990d84fa 100644
--- a/lib/kernels/test/src/test_reduction_kernel.cc
+++ b/lib/kernels/test/src/test_reduction_kernel.cc
@@ -13,7 +13,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
DataType::FLOAT,
};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc
index 69dbe672ac..ceb0915c03 100644
--- a/lib/kernels/test/src/test_replicate_kernel.cc
+++ b/lib/kernels/test/src/test_replicate_kernel.cc
@@ -1,4 +1,5 @@
#include "internal/test_utils.h"
+#include "kernels/create_accessor_with_contents.h"
#include "kernels/format_accessor_contents.h"
#include "kernels/replicate_kernels.h"
#include "kernels/replicate_kernels_cpu.h"
@@ -20,7 +21,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
DataType::FLOAT,
};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator gpu_allocator = create_local_cuda_memory_allocator();
@@ -28,7 +32,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
SUBCASE("forward_kernel") {
GenericTensorAccessorR input =
- create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator);
+ create_1d_accessor_r_with_contents<float>({1, 3, 2}, gpu_allocator);
GenericTensorAccessorW output =
gpu_allocator.allocate_tensor(output_shape);
@@ -43,7 +47,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
}
SUBCASE("backward_kernel") {
- GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents(
+ GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents<float>(
{
{1, 2, 3},
{4, 3, 3},
@@ -51,7 +55,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
},
gpu_allocator);
- GenericTensorAccessorR correct = create_1d_accessor_r_with_contents(
+ GenericTensorAccessorR correct = create_1d_accessor_r_with_contents<float>(
{1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator);
GenericTensorAccessorW input_grad =
@@ -80,9 +84,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
DataType::FLOAT,
};
- ManagedPerDeviceFFHandle managed_handle{
- /*workSpaceSize=*/1024 * 1024,
- /*allowTensorOpMathConversion=*/true};
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator gpu_allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc
index c63a69f76e..69f0a1f214 100644
--- a/lib/kernels/test/src/test_reshape_kernel.cc
+++ b/lib/kernels/test/src/test_reshape_kernel.cc
@@ -5,7 +5,10 @@ using namespace ::FlexFlow;
TEST_SUITE(FF_CUDA_TEST_SUITE) {
TEST_CASE("Test Reshape Forward and Backward") {
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc
index 87ac4e6713..f2ddb2c67b 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -13,7 +13,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -57,9 +60,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index e2a220d24a..0d5dcb79a2 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -12,7 +12,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index a623946972..d8ddb8c4b9 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -13,7 +13,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index b5f80956fa..e2042c1e2c 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -12,7 +12,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }, }; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 2deed6b0a2..6f8f4b1543 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - void write_loss_tensor_to_host(float *host_ptr); + GenericTensorAccessorR get_loss_tensor_accessor() const; 
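+ // Replaces write_loss_tensor_to_host: returns a read-only view of the
+ // loss tensor's backing buffer, which callers can copy out with
+ // copy_tensor_accessor_r (see the test_e2e.cc hunk below).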
};
} // namespace FlexFlow
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
index e58b5dfe7d..790c5e8e18 100644
--- a/lib/local-execution/src/model_training_instance.cc
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -54,14 +54,14 @@ void ModelTrainingInstance::update() {
get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
}
-void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) {
+GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const {
gradient_tensor_t loss_tensor =
this->training_backing.local_tensor_backing.tensor_gradient_mapping.at(
this->logit_tensor);
GenericTensorAccessorW loss_tensor_backing =
this->training_backing.local_tensor_backing.tensor_backings.at(
TensorTypeVariant{loss_tensor});
- write_to_host_float_ptr(loss_tensor_backing, host_ptr);
+ return read_only_accessor_from_write_accessor(loss_tensor_backing);
}
} // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc
index b527430d67..6dabe09799 100644
--- a/lib/local-execution/test/src/test_e2e.cc
+++ b/lib/local-execution/test/src/test_e2e.cc
@@ -1,3 +1,5 @@
+#include "kernels/copy_tensor_accessor.h"
+#include "kernels/local_cpu_allocator.h"
#include "kernels/local_cuda_allocator.h"
#include "kernels/managed_ff_stream.h"
#include "kernels/managed_per_device_ff_handle.h"
@@ -14,7 +16,10 @@ using namespace ::FlexFlow;
-bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) {
+bool did_loss_decrease(
+ GenericTensorAccessorR const &first_epoch,
+ GenericTensorAccessorR const &last_epoch,
+ int batch_size
+) {
for (int i = 0; i < batch_size; i++) {
- if (first_epoch[i] < last_epoch[i]) {
+ if (first_epoch.get_float_ptr()[i] < last_epoch.get_float_ptr()[i]) {
return false;
}
@@ -27,7 +32,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
TEST_CASE("LocalBackend e2e Training") {
// initialize runtime
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
@@ -146,28 +154,26 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
loss_attrs,
optimizer_attrs};
+ Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
int num_epochs = 5;
- int num_samples = batch_size.unwrap_nonnegative();
- std::vector<float *> loss_values(num_epochs);
+ std::vector<GenericTensorAccessorR> loss_values;
for (int i = 0; i < num_epochs; i++) {
model_training_instance.forward();
model_training_instance.backward();
model_training_instance.update();
- float *host_loss_ptr = new float[num_samples];
- model_training_instance.write_loss_tensor_to_host(host_loss_ptr);
- loss_values[i] = host_loss_ptr;
+ loss_values.push_back(
+ copy_tensor_accessor_r(
+ model_training_instance.get_loss_tensor_accessor(),
+ cpu_allocator));
}
// Assert that each sample in the batch has a lower loss in last epoch than
// the first epoch
- float *first_epoch = loss_values[0];
- float *last_epoch = loss_values[num_epochs - 1];
+ GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+ GenericTensorAccessorR last_epoch = loss_values.back();
CHECK(did_loss_decrease(
- first_epoch, last_epoch, batch_size.unwrap_nonnegative()));
-
- for (int i = 0; i < num_epochs; i++) {
- delete[] loss_values[i];
- }
+ first_epoch_loss, last_epoch, batch_size.unwrap_nonnegative()));
}
}
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc
b/lib/local-execution/test/src/test_local_cost_estimator.cc
index c9c5afe04e..4d015f4cfa 100644
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ b/lib/local-execution/test/src/test_local_cost_estimator.cc
@@ -11,8 +11,10 @@ using namespace ::FlexFlow;
TEST_SUITE(FF_TEST_SUITE) {
TEST_CASE("LocalCostEstimator") {
- // local backing initialization
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc
index ca2482653b..e8f48413b6 100644
--- a/lib/local-execution/test/src/test_loss_functions.cc
+++ b/lib/local-execution/test/src/test_loss_functions.cc
@@ -17,7 +17,10 @@ TEST_SUITE(FF_TEST_SUITE) {
TEST_CASE("LossFunctions") {
// initialize runtime
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc
index 75ba517d1b..18509d1fd9 100644
--- a/lib/local-execution/test/src/test_update.cc
+++ b/lib/local-execution/test/src/test_update.cc
@@ -15,7 +15,10 @@ TEST_SUITE(FF_TEST_SUITE) {
TEST_CASE("ExecuteUpdate") {
// initialize runtime configs
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
AllocatedTensors allocated_tensors = make_empty_allocated_tensors();
diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h
index e17f51b73a..9996e36482 100644
--- a/lib/op-attrs/include/op-attrs/datatype.h
+++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -13,34 +13,58 @@
template <DataType>
struct data_type_enum_to_class;
template <>
-struct data_type_enum_to_class<DataType::FLOAT> {
- using type = float;
-};
+struct data_type_enum_to_class<DataType::FLOAT>
+ : type_identity<float> {};
template <>
-struct data_type_enum_to_class<DataType::DOUBLE> {
- using type = double;
-};
+struct data_type_enum_to_class<DataType::DOUBLE>
+ : type_identity<double> {};
template <>
-struct data_type_enum_to_class<DataType::INT32> {
- using type = int32_t;
-};
+struct data_type_enum_to_class<DataType::INT32>
+ : type_identity<int32_t> {};
template <>
-struct data_type_enum_to_class<DataType::INT64> {
- using type = int64_t;
-};
+struct data_type_enum_to_class<DataType::INT64>
+ : type_identity<int64_t> {};
template <>
-struct data_type_enum_to_class<DataType::HALF> {
- using type = half;
-};
+struct data_type_enum_to_class<DataType::HALF>
+ : type_identity<half> {};
template <>
-struct data_type_enum_to_class<DataType::BOOL> {
- using type = bool;
-};
+struct data_type_enum_to_class<DataType::BOOL>
+ : type_identity<bool> {};
+
+template <typename T>
+struct type_to_data_type_enum;
+
+template <>
+struct type_to_data_type_enum<float>
+ : std::integral_constant<DataType, DataType::FLOAT> {};
+
+template <>
+struct type_to_data_type_enum<double>
+ : std::integral_constant<DataType, DataType::DOUBLE> {};
+
+template <>
+struct type_to_data_type_enum<int32_t>
+ : std::integral_constant<DataType, DataType::INT32> {};
+
+template <>
+struct type_to_data_type_enum<int64_t>
+ : std::integral_constant<DataType, DataType::INT64> {};
+
+template <>
+struct type_to_data_type_enum<half>
+ : std::integral_constant<DataType, DataType::HALF> {};
+
+template <>
+struct type_to_data_type_enum<bool>
+ : std::integral_constant<DataType, DataType::BOOL> {};
+
+template <typename T>
+inline constexpr DataType type_to_data_type_enum_v = type_to_data_type_enum<T>::value;
template <DataType DT, typename T>
typename data_type_enum_to_class<DT>
::type cast_to(T t) { From d1ffea9fd00d35da4b9f5e9b943d06cb25aaf8e2 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 14 May 2025 18:19:32 -0700 Subject: [PATCH 72/91] Fix typo in task-spec --- .proj.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.proj.toml b/.proj.toml index 3a120ca553..b14d763339 100644 --- a/.proj.toml +++ b/.proj.toml @@ -56,7 +56,7 @@ has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false -[targets.task_spec] +[targets.task-spec] type = "lib" has-cpu-only-tests = true has-cpu-only-benchmarks = false From 7e45215be7b49d3a5f10140b5732b6e7d6bca658 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 21 May 2025 08:18:08 +0000 Subject: [PATCH 73/91] Add positive_int and tensor reductions/comparisons --- .../src/compiler/allowed_machine_views.cc | 30 +- .../get_machine_resource_splits.cc | 8 +- .../test/src/allowed_machine_views.cc | 40 +- ...racted_tensor_set_movement_across_split.cc | 8 +- .../get_machine_resource_splits.cc | 190 +++---- .../get_optimal_machine_mapping.cc | 22 +- .../get_tensor_set_movement_across_split.cc | 16 +- .../machine_mapping/machine_mapping.cc | 8 +- .../get_machine_mapping_problem_tree.cc | 6 +- .../machine_mapping/machine_mapping_result.cc | 12 +- ...get_optimal_machine_mapping_with_memory.cc | 22 +- .../machine_mapping_result_with_memory.cc | 20 +- ...ion_graph_series_parallel_decomposition.cc | 22 +- .../get_pcg_series_parallel_decomposition.cc | 32 +- .../task_graph_simulator/task_simulator.cc | 34 +- lib/compiler/test/src/graph_optimize_state.cc | 24 +- lib/kernels/include/kernels/accessor.h | 11 +- lib/kernels/include/kernels/array_coord.h | 2 +- lib/kernels/include/kernels/array_shape.h | 34 +- .../kernels/create_accessor_with_contents.h | 63 +-- .../include/kernels/fill_tensor_accessor.h | 22 + lib/kernels/include/kernels/legion_dim.h | 3 +- .../include/kernels/map_tensor_accessors.h | 51 +- .../include/kernels/optimizer_kernels.h | 4 +- .../include/kernels/reduce_tensor_accessor.h | 88 +++ .../include/kernels/reduce_tensor_accessors.h | 39 -- .../reverse_kernels_params.struct.toml | 10 +- .../kernels/tensor_accessor_reductions.h | 13 + lib/kernels/src/cpu/ops/cast_kernels.cc | 4 +- lib/kernels/src/cpu/ops/combine_kernels.cc | 6 +- .../src/cpu/ops/initializer_kernels.cc | 4 +- lib/kernels/src/cpu/ops/replicate_kernels.cc | 12 +- lib/kernels/src/cpu/ops/reverse_kernels.cc | 4 +- lib/kernels/src/cuda/cuda_helper.cu | 8 +- lib/kernels/src/cuda/embedding_kernels.cu | 48 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 4 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 6 +- lib/kernels/src/cuda/ops/concat_kernels.cu | 4 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 16 +- .../src/cuda/ops/element_unary_kernels.cu | 8 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 6 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 16 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 8 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 16 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 16 +- lib/kernels/src/cuda/optimizer_kernels.cu | 2 +- lib/kernels/src/kernels/accessor.cc | 65 ++- lib/kernels/src/kernels/allocation.cc | 2 +- lib/kernels/src/kernels/array_shape.cc | 57 +- .../src/kernels/compare_tensor_accessors.cc | 36 +- .../src/kernels/fill_tensor_accessor.cc | 26 + .../src/kernels/format_accessor_contents.cc | 25 +- 
.../src/kernels/map_tensor_accessors.cc | 13 +- .../src/kernels/reduce_tensor_accessor.cc | 17 + .../src/kernels/reverse_kernels_params.cc | 10 +- .../src/kernels/tensor_accessor_reductions.cc | 27 + lib/kernels/test/CMakeLists.txt | 7 - lib/kernels/test/modify_test_commands.cmake | 21 - .../test/src/cpu/ops/replicate_kernels.cc | 4 +- .../test/src/cpu/ops/reverse_kernels.cc | 4 +- lib/kernels/test/src/internal/test_utils.cc | 12 +- lib/kernels/test/src/kernels/accessor.cc | 70 ++- lib/kernels/test/src/kernels/array_shape.cc | 63 ++- .../src/kernels/compare_tensor_accessors.cc | 163 ++++++ .../kernels/create_accessor_with_contents.cc | 133 +++++ .../test/src/kernels/map_tensor_accessors.cc | 151 +++++ .../src/kernels/reduce_tensor_accessor.cc | 68 +++ .../src/kernels/tensor_accessor_reductions.cc | 106 ++++ lib/kernels/test/src/test_attention_kernel.cc | 46 +- .../test/src/test_batch_matmul_kernel.cc | 24 +- .../test/src/test_batch_norm_kernel.cc | 18 +- lib/kernels/test/src/test_cast_kernel.cc | 8 +- lib/kernels/test/src/test_combine_kernel.cc | 4 +- lib/kernels/test/src/test_concat_kernel.cc | 30 +- lib/kernels/test/src/test_dropout.cc | 4 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 18 +- .../test/src/test_layer_norm_kernels.cc | 8 +- .../test/src/test_managed_ff_stream.cc | 18 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 48 +- lib/kernels/test/src/test_reduction_kernel.cc | 4 +- lib/kernels/test/src/test_replicate_kernel.cc | 14 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 4 +- lib/kernels/test/src/test_softmax_kernel.cc | 4 +- lib/kernels/test/src/test_split_kernel.cc | 4 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- lib/local-execution/src/loss_functions.cc | 54 +- lib/local-execution/src/optimizer.cc | 26 +- .../test/src/test_allocated_tensors.cc | 8 +- lib/local-execution/test/src/test_e2e.cc | 30 +- .../test/src/test_local_cost_estimator.cc | 16 +- .../test/src/test_local_slots_backing.cc | 276 --------- .../test/src/test_local_task_arg_accessor.cc | 12 +- .../test/src/test_local_tensor_backing.cc | 6 +- .../test/src/test_loss_functions.cc | 16 +- .../test/src/test_task_registry.cc | 8 +- .../test/src/test_unallocated_tensors.cc | 8 +- lib/local-execution/test/src/test_update.cc | 12 +- .../models/bert/bert_config.struct.toml | 16 +- .../candle_uno/candle_uno_config.struct.toml | 10 +- .../models/dlrm/dlrm_config.struct.toml | 14 +- .../inception_v3_config.struct.toml | 6 +- .../include/models/split_test/split_test.h | 2 +- .../transformer_config.struct.toml | 18 +- lib/models/src/models/bert/bert.cc | 22 +- .../src/models/candle_uno/candle_uno.cc | 24 +- lib/models/src/models/dlrm/dlrm.cc | 44 +- .../src/models/inception_v3/inception_v3.cc | 530 +++++++++--------- .../src/models/split_test/split_test.cc | 12 +- .../src/models/transformer/transformer.cc | 26 +- lib/op-attrs/include/op-attrs/datatype.h | 6 +- .../include/op-attrs/datatype_value.h | 3 + .../initializers/kaiming_initializer_mode.h | 4 +- lib/op-attrs/include/op-attrs/ops/attention.h | 36 +- .../multihead_attention_inputs.struct.toml | 12 +- .../op-attrs/ops/attention_attrs.struct.toml | 10 +- .../op-attrs/ops/combine_attrs.struct.toml | 4 +- .../conv_2d/conv_2d_input_shape.struct.toml | 10 +- .../conv_2d_parallel_input_shape.struct.toml | 6 +- .../op-attrs/ops/conv_2d_attrs.struct.toml | 13 +- .../op-attrs/ops/embedding_attrs.struct.toml | 6 +- 
.../op-attrs/ops/linear_attrs.struct.toml | 4 +- lib/op-attrs/include/op-attrs/ops/pool_2d.h | 4 +- .../op-attrs/ops/pool_2d_attrs.struct.toml | 9 +- .../op-attrs/ops/reduction_attrs.struct.toml | 4 +- .../ops/repartition_attrs.struct.toml | 4 +- .../op-attrs/ops/replicate_attrs.struct.toml | 4 +- .../op-attrs/ops/topk_attrs.struct.toml | 4 +- .../parallel_tensor_dim_degrees.struct.toml | 4 +- .../include/op-attrs/parallel_tensor_dims.h | 10 +- .../include/op-attrs/parallel_tensor_shape.h | 14 +- .../discard_copy_degree.struct.toml | 4 +- .../sum_degree.struct.toml | 4 +- .../op-attrs/replica_parallel_dim.struct.toml | 4 +- .../op-attrs/replica_parallel_dim_set.h | 3 +- .../op-attrs/shard_parallel_dim.struct.toml | 6 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 8 +- .../include/op-attrs/tensor_dims.struct.toml | 4 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 8 +- lib/op-attrs/src/op-attrs/datatype.cc | 14 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 11 + .../src/op-attrs/initializer_attrs.cc | 14 +- .../initializers/kaiming_initializer_mode.cc | 8 +- lib/op-attrs/src/op-attrs/ops/attention.cc | 86 +-- .../attention/multihead_attention_inputs.cc | 18 +- .../multihead_attention_parallel_inputs.cc | 6 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 21 +- lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 18 +- lib/op-attrs/src/op-attrs/ops/combine.cc | 6 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 14 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 47 +- .../ops/conv_2d/conv_2d_input_shape.cc | 8 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 16 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 10 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 22 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 50 +- lib/op-attrs/src/op-attrs/ops/reduction.cc | 5 +- lib/op-attrs/src/op-attrs/ops/weight.cc | 2 +- .../src/op-attrs/parallel_tensor_dims.cc | 24 +- .../src/op-attrs/parallel_tensor_shape.cc | 20 +- .../src/op-attrs/replica_parallel_dim_set.cc | 8 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 12 +- lib/op-attrs/src/op-attrs/tensor_shape.cc | 8 +- .../test/src/op-attrs/ops/attention.cc | 132 ++--- .../test/src/op-attrs/ops/batch_matmul.cc | 126 ++--- .../test/src/op-attrs/ops/batch_norm.cc | 84 +-- lib/op-attrs/test/src/op-attrs/ops/cast.cc | 28 +- lib/op-attrs/test/src/op-attrs/ops/combine.cc | 19 +- lib/op-attrs/test/src/op-attrs/ops/concat.cc | 160 +++--- lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc | 118 ++-- lib/op-attrs/test/src/op-attrs/ops/dropout.cc | 48 +- .../test/src/op-attrs/ops/element_binary.cc | 70 +-- .../test/src/op-attrs/ops/element_unary.cc | 30 +- .../test/src/op-attrs/ops/embedding.cc | 56 +- lib/op-attrs/test/src/op-attrs/ops/flat.cc | 130 ++--- .../test/src/op-attrs/ops/layer_norm.cc | 84 +-- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 112 ++-- lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc | 200 +++---- .../test/src/op-attrs/ops/reduction.cc | 19 +- .../test/src/op-attrs/ops/repartition.cc | 14 +- .../test/src/op-attrs/ops/replicate.cc | 16 +- lib/op-attrs/test/src/op-attrs/ops/softmax.cc | 52 +- .../test/src/op-attrs/pcg_operator_attrs.cc | 2 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 24 +- .../include/pcg/computation_graph_builder.h | 38 +- lib/pcg/include/pcg/machine_specification.h | 8 +- .../pcg/machine_specification.struct.toml | 8 +- lib/pcg/include/pcg/operator_task_space.h | 2 +- .../pcg/operator_task_space.struct.toml | 6 +- .../parallel_computation_graph_builder.h | 34 +- 
lib/pcg/include/pcg/stride_t.struct.toml | 4 +- lib/pcg/src/pcg/computation_graph_builder.cc | 42 +- lib/pcg/src/pcg/machine_specification.cc | 9 +- lib/pcg/src/pcg/machine_view.cc | 8 +- lib/pcg/src/pcg/operator_task_space.cc | 8 +- .../generate_weight_transform.cc | 4 +- .../parallel_computation_graph_builder.cc | 38 +- lib/pcg/test/src/pcg/computation_graph.cc | 40 +- .../test/src/pcg/computation_graph_builder.cc | 14 +- .../file_format/v1/v1_computation_graph.cc | 8 +- .../v1/v1_parallel_computation_graph.cc | 10 +- lib/pcg/test/src/pcg/machine_specification.cc | 6 +- lib/pcg/test/src/pcg/machine_view.cc | 74 +-- lib/pcg/test/src/pcg/operator_task_space.cc | 8 +- .../parallel_computation_graph.cc | 34 +- .../parallel_computation_graph_builder.cc | 136 ++--- .../src/pcg/pcg_from_computation_graph.cc | 8 +- .../src/pcg/start_invariant_machine_view.cc | 32 +- .../operator_attribute_value.variant.toml | 6 +- .../materialize_operator_from_attrs_map.cc | 18 +- .../apply_substitution/apply_substitution.cc | 20 +- .../evaluate_substitution_output.cc | 16 +- .../perform_shape_inference.cc | 14 +- .../operator_pattern/get_attribute.cc | 2 +- .../test/src/substitutions/pcg_pattern.cc | 20 +- .../substitutions/unity_substitution_set.cc | 6 +- lib/task-spec/CMakeLists.txt | 2 + lib/task-spec/src/task-spec/ops/attention.cc | 54 +- .../src/task-spec/ops/batch_matmul.cc | 80 +-- lib/task-spec/src/task-spec/ops/batch_norm.cc | 18 +- lib/task-spec/src/task-spec/ops/conv_2d.cc | 10 +- lib/task-spec/src/task-spec/ops/gather.cc | 6 +- lib/task-spec/src/task-spec/ops/layer_norm.cc | 12 +- lib/task-spec/src/task-spec/ops/linear.cc | 32 +- lib/task-spec/src/task-spec/ops/pool_2d.cc | 40 +- lib/task-spec/src/task-spec/ops/reduce.cc | 2 +- lib/task-spec/src/task-spec/ops/reduction.cc | 4 +- lib/task-spec/src/task-spec/ops/replicate.cc | 2 +- lib/task-spec/src/task-spec/ops/softmax.cc | 18 +- lib/task-spec/src/task-spec/ops/split.cc | 20 +- lib/task-spec/src/task-spec/ops/topk.cc | 20 +- lib/task-spec/test/src/task-spec/arg_ref.cc | 2 - lib/utils/include/utils/containers/sum.h | 17 +- .../include/utils/nonnegative_int/ceildiv.h | 1 + .../include/utils/positive_int/ceildiv.h | 12 + .../include/utils/positive_int/positive_int.h | 114 ++++ .../src/utils/nonnegative_int/ceildiv.cc | 20 - lib/utils/src/utils/positive_int/ceildiv.cc | 14 + .../src/utils/positive_int/positive_int.cc | 283 ++++++++++ lib/utils/test/src/utils/containers/sum.cc | 18 + .../test/src/utils/nonnegative_int/ceildiv.cc | 52 -- .../test/src/utils/positive_int/ceildiv.cc | 28 + .../src/utils/positive_int/positive_int.cc | 10 + 249 files changed, 4212 insertions(+), 3136 deletions(-) create mode 100644 lib/kernels/include/kernels/fill_tensor_accessor.h create mode 100644 lib/kernels/include/kernels/reduce_tensor_accessor.h delete mode 100644 lib/kernels/include/kernels/reduce_tensor_accessors.h create mode 100644 lib/kernels/include/kernels/tensor_accessor_reductions.h create mode 100644 lib/kernels/src/kernels/fill_tensor_accessor.cc create mode 100644 lib/kernels/src/kernels/reduce_tensor_accessor.cc create mode 100644 lib/kernels/src/kernels/tensor_accessor_reductions.cc delete mode 100644 lib/kernels/test/modify_test_commands.cmake create mode 100644 lib/kernels/test/src/kernels/create_accessor_with_contents.cc create mode 100644 lib/kernels/test/src/kernels/map_tensor_accessors.cc create mode 100644 lib/kernels/test/src/kernels/reduce_tensor_accessor.cc create mode 100644 lib/kernels/test/src/kernels/tensor_accessor_reductions.cc delete mode 
100644 lib/local-execution/test/src/test_local_slots_backing.cc create mode 100644 lib/utils/include/utils/positive_int/ceildiv.h create mode 100644 lib/utils/include/utils/positive_int/positive_int.h delete mode 100644 lib/utils/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/src/utils/positive_int/ceildiv.cc create mode 100644 lib/utils/src/utils/positive_int/positive_int.cc delete mode 100644 lib/utils/test/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/test/src/utils/positive_int/ceildiv.cc create mode 100644 lib/utils/test/src/utils/positive_int/positive_int.cc diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index 6f86d1d82a..fa543e78b5 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -17,7 +17,7 @@ #include "utils/containers/unordered_multiset_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/zip.h" -#include "utils/nonnegative_int/ceildiv.h" +#include "utils/positive_int/ceildiv.h" #include "utils/nonnegative_int/nonnegative_range.h" #include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" @@ -51,24 +51,24 @@ static std::unordered_set DeviceType const &device_type) { auto get_max_stride_upper_bound = - [](std::vector const &tensor_dims, - nonnegative_int total_devices) -> nonnegative_int { + [](std::vector const &tensor_dims, + positive_int total_devices) -> positive_int { nonnegative_int min_num_devices_with_full_stride_volume = - product(transform(tensor_dims, [](nonnegative_int num_devices) { - return nonnegative_int{num_devices.unwrap_nonnegative() - 1}; + product(transform(tensor_dims, [](positive_int num_devices) { + return nonnegative_int{num_devices.int_from_positive_int() - 1}; })); - return ceildiv(total_devices, min_num_devices_with_full_stride_volume); + return ceildiv(total_devices, positive_int{min_num_devices_with_full_stride_volume}); }; - auto candidate_strides = [&](std::vector const &tensor_dims, - nonnegative_int total_devices) + auto candidate_strides = [&](std::vector const &tensor_dims, + positive_int total_devices) -> std::unordered_multiset { - nonnegative_int max_stride_upper_bound = + positive_int max_stride_upper_bound = get_max_stride_upper_bound(tensor_dims, total_devices); std::vector single_stride_range = - transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n), - [](nonnegative_int stride) { return stride_t{stride}; }); + transform(nonnegative_range(1_n, max_stride_upper_bound.nonnegative_int_from_positive_int() + 1_n), + [](nonnegative_int stride) { return stride_t{positive_int{stride}}; }); std::unordered_multiset> raw_stride_vectors = cartesian_product( repeat_element(/*num_times=*/num_elements(tensor_dims), @@ -83,9 +83,9 @@ static std::unordered_set auto candidate_starts = [](MachineSpecification const &ms, DeviceType const &device_type) { std::unordered_set result; - for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes)) { + for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes.nonnegative_int_from_positive_int())) { for (nonnegative_int device_idx : - nonnegative_range(get_num_devices_per_node(ms, device_type))) { + nonnegative_range(get_num_devices_per_node(ms, device_type).nonnegative_int_from_positive_int())) { result.insert( MachineSpaceCoordinate{node_idx, device_idx, device_type}); } @@ -100,8 +100,8 @@ static std::unordered_set return get_all_permutations_with_repetition(options, 
num_dims(task)); }; - std::vector tensor_dims = task.degrees; - nonnegative_int total_devices = get_num_devices(machine_spec, device_type); + std::vector tensor_dims = task.degrees; + positive_int total_devices = get_num_devices(machine_spec, device_type); std::unordered_set machine_views; diff --git a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc index bb9d54f1e9..e921a0c465 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -11,9 +11,9 @@ std::unordered_set> for (int i = 1; i < resource.num_nodes; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_nodes = nonnegative_int{i}; + sub_resource1.num_nodes = positive_int{i}; sub_resource2.num_nodes = - nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i}; + positive_int{resource.num_nodes.int_from_positive_int() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } @@ -21,9 +21,9 @@ std::unordered_set> for (int i = 1; i < resource.num_gpus_per_node; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_gpus_per_node = nonnegative_int{i}; + sub_resource1.num_gpus_per_node = positive_int{i}; sub_resource2.num_gpus_per_node = - nonnegative_int{resource.num_gpus_per_node.unwrap_nonnegative() - i}; + positive_int{resource.num_gpus_per_node.int_from_positive_int() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc index 817cc80700..15f7d60060 100644 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -15,39 +15,39 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("1 degree of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, + /*num_nodes=*/1_p, + /*num_cpus_per_node=*/5_p, + /*num_gpus_per_node=*/5_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; std::unordered_set correct = { MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}, }, }; @@ -61,18 +61,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("2 degrees of parallelism") { MachineSpecification ms = MachineSpecification{ - 
/*num_nodes=*/3_n, - /*num_cpus_per_node=*/3_n, - /*num_gpus_per_node=*/3_n, + /*num_nodes=*/3_p, + /*num_cpus_per_node=*/3_p, + /*num_gpus_per_node=*/3_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 3_p}}; auto make_2d_view = [&](nonnegative_int start_node_idx, nonnegative_int start_device_idx, - nonnegative_int stride1, - nonnegative_int stride2, + positive_int stride1, + positive_int stride2, MachineSpecificationDimension m1, MachineSpecificationDimension m2) { return MachineView{ @@ -87,18 +87,18 @@ TEST_SUITE(FF_TEST_SUITE) { auto inter = MachineSpecificationDimension::INTER_NODE; std::unordered_set correct = { make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + 0_n, 0_n, /*stride1=*/1_p, /*stride2=*/1_p, inter, intra), make_2d_view( - 1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + 1_n, 0_n, /*stride1=*/1_p, /*stride2=*/1_p, inter, intra), make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra), + 0_n, 0_n, /*stride1=*/2_p, /*stride2=*/1_p, inter, intra), make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + 0_n, 0_n, /*stride1=*/1_p, /*stride2=*/1_p, intra, inter), make_2d_view( - 0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + 0_n, 1_n, /*stride1=*/1_p, /*stride2=*/1_p, intra, inter), make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter), + 0_n, 0_n, /*stride1=*/2_p, /*stride2=*/1_p, intra, inter), }; std::unordered_set result = diff --git a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc index 13067f5d02..0416a73660 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc @@ -28,9 +28,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*op_attrs=*/PCGOperatorAttrs{ RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }, }, /*name=*/std::nullopt, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc index 5f4ba2bfdc..5ae89a8123 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -8,11 +8,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_machine_resource_splits") { - auto make_machine_spec = [](nonnegative_int num_nodes, - nonnegative_int num_gpus_per_node) { + auto make_machine_spec = [](positive_int num_nodes, + positive_int num_gpus_per_node) { return MachineSpecification{ /*num_nodes=*/num_nodes, - /*num_cpus_per_node=*/1_n, + /*num_cpus_per_node=*/1_p, /*num_gpus_per_node=*/num_gpus_per_node, /*inter_node_bandwidth=*/1.0, /*intra_node_bandwidth=*/1.0, @@ -20,8 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("returns no splits if no splits are 
possible") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n); + MachineSpecification input = make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p); std::unordered_set> result = get_machine_resource_splits(input); @@ -33,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "returns splits in gpu and node dimensions, but not at the same time") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/2_n); + MachineSpecification input = make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/2_p); std::unordered_set> result = get_machine_resource_splits(input); @@ -42,16 +42,16 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set> correct = { { - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), }, }; @@ -62,8 +62,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in node dimension in powers of two") { SUBCASE("num_nodes is a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/8_n, - /*num_gpus_per_node=*/1_n); + make_machine_spec(/*num_nodes=*/8_p, + /*num_gpus_per_node=*/1_p); std::unordered_set< std::pair> @@ -73,34 +73,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/7_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/7_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/6_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/6_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/6_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/6_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/7_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/7_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; @@ -109,8 +109,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("num_nodes is not a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/6_n, - /*num_gpus_per_node=*/1_n); + make_machine_spec(/*num_nodes=*/6_p, + /*num_gpus_per_node=*/1_p); std::unordered_set< std::pair> @@ -120,28 +120,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - 
/*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/5_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/5_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/5_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/5_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; @@ -152,8 +152,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in gpu dimension in powers of two") { SUBCASE("num_gpus_per_node is a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/8_n); + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/8_p); std::unordered_set< std::pair> @@ -163,34 +163,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/7_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/7_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/6_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/6_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/6_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/6_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/7_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/7_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; @@ -199,8 +199,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("num_gpus_per_node is not a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/6_n); + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/6_p); std::unordered_set< std::pair> @@ -210,28 +210,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/5_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + 
/*num_gpus_per_node=*/5_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/5_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/5_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; } diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index e506dea1d7..c3342c1b3a 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -68,24 +68,24 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/2_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/1_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape tensor_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 8_n, + FFOrdered{ + 10_p, + 8_p, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index 51e6074bf2..c5b68e3a76 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -41,9 +41,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -56,7 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*op_attrs=*/PCGOperatorAttrs{ RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }, }, /*name=*/std::nullopt, @@ -106,7 +106,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -121,7 +121,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -136,7 +136,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { 
MachineViewDimension{ - stride_t{3_n}, + stride_t{3_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -151,7 +151,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{4_n}, + stride_t{4_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc index e88b714bd4..928d30ecaa 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc @@ -16,7 +16,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -62,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -77,7 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 048f1ddcac..d2c829df30 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -65,9 +65,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 1_n, + FFOrdered{ + 10_p, + 1_p, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index 4a261bcdae..c7a757d91f 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -196,7 +196,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -211,7 +211,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -319,7 +319,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -334,7 +334,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc 
b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8ae1ebe753..22202c36f7 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -68,24 +68,24 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/2_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/1_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape tensor_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 8_n, + FFOrdered{ + 12_p, + 8_p, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 04149cae8f..35b55d2273 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -16,7 +16,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -46,7 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{4_n}, + stride_t{4_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -166,7 +166,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -181,7 +181,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -367,7 +367,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -382,7 +382,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -499,7 +499,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -514,7 +514,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -529,7 +529,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { 
MachineViewDimension{ - stride_t{4_n}, + stride_t{4_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index fcd508828c..1c801161ca 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -37,9 +37,9 @@ TEST_SUITE(FF_TEST_SUITE) { InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT}; @@ -62,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("single operator plus inputs and weights") { LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { // op1 op2 LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -267,7 +267,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("real models") { SUBCASE("split_test") { ComputationGraph cg = - get_split_test_computation_graph(/*batch_size=*/8_n); + get_split_test_computation_graph(/*batch_size=*/8_p); std::optional sp_decomposition = get_computation_graph_series_parallel_decomposition(cg); @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = - TensorShape{TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorShape{TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - b.dense(input, /*outDim=*/14_n); + b.dense(input, /*outDim=*/14_p); return b.computation_graph; }(); @@ -358,7 +358,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("split_test") { ComputationGraph cg = - get_split_test_computation_graph(/*batch_size=*/8_n); + get_split_test_computation_graph(/*batch_size=*/8_p); std::string result = render_preprocessed_computation_graph_for_sp_decomposition(cg); diff --git a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc index 06664b38fa..13f15f6db3 100644 --- a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc @@ -22,9 +22,9 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT}; InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; @@ -58,7 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_input = get_only(input_added.outputs); LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -133,7 +133,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; LinearAttrs linear_attrs = LinearAttrs{ - 
/*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -204,9 +204,9 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 10_n, + TensorDims{FFOrdered{ + 12_p, + 10_p, }}, DataType::FLOAT, }; @@ -218,7 +218,7 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionAttrs p2_attrs = RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/3_n, + /*repartition_degree=*/3_p, }; ParallelLayerAddedResult p2_added = add_parallel_layer(pcg, make_layer_attrs(p2_attrs), {t_input}, {}); @@ -227,7 +227,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelLayerAttrs p3_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{1_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }}, /*name=*/std::nullopt, }; @@ -243,8 +243,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_op0 = get_only(op0_added.outputs); EmbeddingAttrs op1_attrs = EmbeddingAttrs{ - /*num_entires=*/100_n, - /*out_channels=*/22_n, + /*num_entires=*/100_p, + /*out_channels=*/22_p, /*aggr=*/AggregateOp::SUM, /*data_type=*/DataType::FLOAT, }; @@ -262,7 +262,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_w1 = get_only(w1_added.outputs); ReplicateAttrs p1_attrs = ReplicateAttrs{ - /*replicate_degree=*/6_n, + /*replicate_degree=*/6_p, }; ParallelLayerAddedResult p1_added = add_parallel_layer(pcg, make_layer_attrs(p1_attrs), {t_w1}, {}); @@ -272,7 +272,7 @@ TEST_SUITE(FF_TEST_SUITE) { add_parallel_layer(pcg, make_layer_attrs(op1_attrs), {t_op0}, {t_p1}); LinearAttrs op2_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -289,7 +289,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_w2 = get_only(w2_added.outputs); ReplicateAttrs p4_attrs = ReplicateAttrs{ - /*replicate_degree=*/3_n, + /*replicate_degree=*/3_p, }; ParallelLayerAddedResult p4_added = add_parallel_layer(pcg, make_layer_attrs(p4_attrs), {t_w2}, {}); @@ -297,7 +297,7 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionAttrs p5_attrs = RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }; ParallelLayerAddedResult p5_added = add_parallel_layer(pcg, make_layer_attrs(p5_attrs), {t_p4}, {}); diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index f320e45d06..c3c83dd6b8 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -38,9 +38,9 @@ namespace FlexFlow { TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("task_simulator_estimate_forward_pass_time") { MachineSpecification machine_spec = - MachineSpecification{/*num_nodes=*/3_n, - /*num_cpus_per_node=*/3_n, - /*num_gpus_per_node=*/3_n, + MachineSpecification{/*num_nodes=*/3_p, + /*num_cpus_per_node=*/3_p, + /*num_gpus_per_node=*/3_p, /*inter_node_bandwidth=*/1.0f, /*intra_node_bandwidth=*/1.0f}; @@ -48,9 +48,9 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 7_n, + FFOrdered{ + 10_p, + 7_p, }, }, DataType::FLOAT, @@ -62,13 +62,13 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t layer1 = get_source_layer(tensor1); std::vector dims = { - 
MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, }; ParallelComputationGraph pcg = b.pcg; @@ -127,9 +127,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 1_n, + FFOrdered{ + 10_p, + 1_p, }, }, DataType::FLOAT, @@ -147,13 +147,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = b.pcg; std::vector dims = { - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, }; diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 5c00ce1558..e7060ef421 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -8,25 +8,13 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("GraphOptimizeState::operator==") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 32_n, - 16_n, + FFOrdered{ + 32_p, + 16_p, }, }, DataType::FLOAT, }; - // ParallelTensorShape input_shape = - // ParallelTensorShape{ParallelTensorDims{ - // FFOrdered{ - // ShardParallelDim{32_n, 2_n}, - // ShardParallelDim{16_n, 1_n}, - // }, - // ReplicaParallelDimSet{ - // SumDegree{1_n}, - // DiscardCopyDegree{1_n}, - // }, - // }, - // DataType::FLOAT}; // `machine_mapping` is determined by the PCG and the device mapping // algorithm, and `runtime` is determined by the PCG and the device mapping, @@ -43,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { builder.create_input_tensor(input_shape, "input0"); parallel_tensor_guid_t dense0 = builder.dense(/*input=*/input0, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -53,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t dense1 = builder.dense(/*input=*/dense0, - /*outDim=*/4_n, + /*outDim=*/4_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -89,7 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { builder_.create_input_tensor(input_shape, "input0"); parallel_tensor_guid_t dense0_ = builder_.dense(/*input=*/input0_, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 431facd6c1..c24695298b 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -7,7 +7,6 @@ #include "op-attrs/datatype.h" #include "pcg/device_type.dtg.h" #include "utils/containers/transform.h" -#include "utils/required.h" #include namespace FlexFlow { @@ -154,8 +153,6 @@ class GenericTensorAccessorW { std::string format_as(GenericTensorAccessorW const &); std::ostream &operator<<(std::ostream &, 
GenericTensorAccessorW const &); -static_assert(is_fmtable<GenericTensorAccessorW const &>::value, ""); - template <DataType DT> typename data_type_enum_to_class<DT>
::type * get(GenericTensorAccessorW const &a) { @@ -245,6 +242,14 @@ std::pair void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor); +template <DataType DT> +real_type_t<DT>
accessor_get_only_value(GenericTensorAccessorR const &acc) { + ASSERT(get_num_elements(acc.shape) == 1); + ASSERT(acc.data_type == DT); + + return *static_cast<real_type_t<DT> const *>(acc.ptr); +} + } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h index f739a3d707..84e68fa053 100644 --- a/lib/kernels/include/kernels/array_coord.h +++ b/lib/kernels/include/kernels/array_coord.h @@ -5,7 +5,7 @@ namespace FlexFlow { -ArrayCoord array_coord_drop_dims(ArrayCoord const &, +ArrayCoord array_coord_drop_dims(ArrayCoord const &coord, std::function<bool(ff_dim_t)> const &should_drop_dim); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 25ef8116f2..355b6e5bca 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -4,7 +4,7 @@ #include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" -#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" #include "utils/stack_vector/stack_vector.h" #include "utils/visitable.h" #include @@ -16,25 +16,15 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - explicit ArrayShape(LegionOrdered<nonnegative_int> const &dims); + explicit ArrayShape(LegionOrdered<positive_int> const &dims); - /** - * @brief Alias of ArrayShape::num_elements for compatibility with - * Legion::Domain - */ - nonnegative_int get_volume() const; + positive_int num_elements() const; - /** - * @brief Alias of ArrayShape::num_dims for compatibility with Legion::Domain - */ - nonnegative_int get_dim() const; - - nonnegative_int num_elements() const; nonnegative_int num_dims() const; - nonnegative_int operator[](legion_dim_t) const; - nonnegative_int at(legion_dim_t) const; - nonnegative_int at(ff_dim_t) const; + positive_int operator[](legion_dim_t) const; + positive_int at(legion_dim_t) const; + positive_int at(ff_dim_t) const; bool operator==(ArrayShape const &) const; bool operator!=(ArrayShape const &) const; @@ -42,8 +32,8 @@ struct ArrayShape { legion_dim_t last_idx() const; legion_dim_t neg_idx(int) const; - std::optional<nonnegative_int> at_maybe(legion_dim_t) const; - std::optional<nonnegative_int> at_maybe(ff_dim_t) const; + std::optional<positive_int> at_maybe(legion_dim_t) const; + std::optional<positive_int> at_maybe(ff_dim_t) const; ArrayShape sub_shape(ff_dim_t const &start, std::optional<ff_dim_t> const &end) const; @@ -52,7 +42,7 @@ struct ArrayShape { std::optional<legion_dim_t> const &end) const; public: - LegionOrdered<nonnegative_int> dims; + LegionOrdered<positive_int> dims; private: std::tuple tie() const; @@ -63,13 +53,17 @@ struct ArrayShape { std::string format_as(ArrayShape const &); std::ostream &operator<<(std::ostream &, ArrayShape const &); -nonnegative_int get_volume(ArrayShape const &); +positive_int get_num_elements(ArrayShape const &); ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); +std::unordered_set<ff_dim_t> get_ff_dim_t_set(ArrayShape const &); std::unordered_set<ArrayCoord> get_array_coord_set(ArrayShape const &); +ArrayShape array_shape_drop_dims(ArrayShape const &shape, + std::function<bool(ff_dim_t)> const &should_drop_dim); + } // namespace FlexFlow namespace std { diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h index fc07d432b2..966a7a30ad 100644 --- a/lib/kernels/include/kernels/create_accessor_with_contents.h +++ b/lib/kernels/include/kernels/create_accessor_with_contents.h
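As context for the accessor and shape changes above, here is a minimal usage sketch of the new accessor_get_only_value helper. This is only a sketch: it assumes create_local_cpu_memory_allocator and the create_accessor_r_filled_with helper declared later in this patch, and the DataTypeValue construction from a float literal is illustrative rather than taken from this series.

// Hypothetical usage: build a one-element FLOAT tensor on the CPU and read it back.
Allocator cpu_allocator = create_local_cpu_memory_allocator();
TensorShape scalar_shape = TensorShape{
    TensorDims{FFOrdered<positive_int>{1_p}},
    DataType::FLOAT,
};
GenericTensorAccessorR scalar_r = create_accessor_r_filled_with(
    scalar_shape, DataTypeValue{3.5f}, cpu_allocator);
// accessor_get_only_value asserts get_num_elements(acc.shape) == 1 and that
// the requested DataType matches before dereferencing the single element.
float v = accessor_get_only_value<DataType::FLOAT>(scalar_r); // == 3.5f

This is the same read-back path that reduce_tensor_accessor_in_all_dims, added later in this patch, uses to collapse a fully reduced tensor into a plain scalar.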
@@ -5,6 +5,7 @@ #include "kernels/allocation.h" #include "kernels/local_cpu_allocator.h" #include "utils/containers/require_all_same1.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -12,8 +13,7 @@ template GenericTensorAccessorW create_1d_accessor_w_with_contents(std::vector const &contents, Allocator &allocator) { - nonnegative_int ncols = num_elements(contents); - ASSERT(ncols > 0); + positive_int ncols = positive_int{num_elements(contents)}; TensorShape shape = TensorShape{ TensorDims{FFOrdered{ncols}}, @@ -23,7 +23,7 @@ GenericTensorAccessorW Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int col_idx : nonnegative_range(ncols)) { + for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { cpu_accessor.at>(FFOrdered{col_idx}) = contents.at(col_idx.unwrap_nonnegative()); } @@ -38,14 +38,12 @@ GenericTensorAccessorW template GenericTensorAccessorW create_2d_accessor_w_with_contents( std::vector> const &contents, Allocator &allocator) { - nonnegative_int nrows = num_elements(contents); - ASSERT(nrows > 0); + positive_int nrows = positive_int{num_elements(contents)}; - nonnegative_int ncols = throw_if_unexpected( + positive_int ncols = throw_if_unexpected( require_all_same1(transform(contents, [](std::vector const &row) { - return num_elements(row); + return positive_int{num_elements(row)}; }))); - ASSERT(ncols > 0); TensorShape shape = TensorShape{ TensorDims{FFOrdered{nrows, ncols}}, @@ -55,8 +53,8 @@ GenericTensorAccessorW create_2d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int row_idx : nonnegative_range(nrows)) { - for (nonnegative_int col_idx : nonnegative_range(ncols)) { + for (nonnegative_int row_idx : nonnegative_range(nrows.nonnegative_int_from_positive_int())) { + for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { cpu_accessor.at>(FFOrdered{row_idx, col_idx}) = contents.at(row_idx.unwrap_nonnegative()) .at(col_idx.unwrap_nonnegative()); @@ -74,23 +72,20 @@ template GenericTensorAccessorW create_3d_accessor_w_with_contents( std::vector>> const &contents, Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); + positive_int dim0_size = positive_int{num_elements(contents)}; - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( + positive_int dim1_size = throw_if_unexpected(require_all_same1( transform(contents, [](std::vector> const &m) { - return num_elements(m); + return positive_int{num_elements(m)}; }))); - ASSERT(dim1_size > 0); - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( + positive_int dim2_size = throw_if_unexpected(require_all_same1( transform(contents, [](std::vector> const &m) { return throw_if_unexpected( require_all_same1(transform(m, [](std::vector const &vec) { - return num_elements(vec); + return positive_int{num_elements(vec)}; }))); }))); - ASSERT(dim2_size > 0); TensorShape shape = TensorShape{ TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, @@ -100,9 +95,9 @@ GenericTensorAccessorW create_3d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : 
nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { cpu_accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) @@ -123,35 +118,31 @@ template GenericTensorAccessorW create_4d_accessor_w_with_contents( std::vector>>> const &contents, Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); + positive_int dim0_size = positive_int{num_elements(contents)}; - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( + positive_int dim1_size = throw_if_unexpected(require_all_same1(transform( contents, [](std::vector>> const &t) { - return num_elements(t); + return positive_int{num_elements(t)}; }))); - ASSERT(dim1_size > 0); - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( + positive_int dim2_size = throw_if_unexpected(require_all_same1(transform( contents, [](std::vector>> const &m) { return throw_if_unexpected(require_all_same1( transform(m, [](std::vector> const &vec) { - return num_elements(vec); + return positive_int{num_elements(vec)}; }))); }))); - ASSERT(dim2_size > 0); - nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( + positive_int dim3_size = throw_if_unexpected(require_all_same1(transform( contents, [](std::vector>> const &t) { return throw_if_unexpected(require_all_same1( transform(t, [](std::vector> const &mat) { return throw_if_unexpected(require_all_same1( transform(mat, [](std::vector const &vec) { - return num_elements(vec); + return positive_int{num_elements(vec)}; }))); }))); }))); - ASSERT(dim3_size > 0); TensorShape shape = TensorShape{ TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, @@ -160,10 +151,10 @@ GenericTensorAccessorW create_4d_accessor_w_with_contents( GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { - for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim3_idx : nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) diff --git a/lib/kernels/include/kernels/fill_tensor_accessor.h b/lib/kernels/include/kernels/fill_tensor_accessor.h new file mode 100644 index 0000000000..8db63f5a2d --- /dev/null +++ b/lib/kernels/include/kernels/fill_tensor_accessor.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FILL_TENSOR_ACCESSOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FILL_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include 
"op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +void fill_tensor_accessor(GenericTensorAccessorW &, DataTypeValue val); + +GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator); + +GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 9a47c8a0fe..63c6ddb3c6 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -7,8 +7,9 @@ #include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/set_of.h" #include "utils/containers/transform.h" -#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/positive_int/positive_int.h" #include "utils/nonnegative_int/num_elements.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/map_tensor_accessors.h b/lib/kernels/include/kernels/map_tensor_accessors.h index 8447c60892..eed17cbb61 100644 --- a/lib/kernels/include/kernels/map_tensor_accessors.h +++ b/lib/kernels/include/kernels/map_tensor_accessors.h @@ -8,9 +8,32 @@ #include "kernels/datatype_dispatch.h" #include "utils/containers/require_same.h" #include "utils/containers/require_all_same1.h" +#include namespace FlexFlow { +template +struct CPUMapTensorAccessorInPlace { + template + void operator()(GenericTensorAccessorW &accessor, + F &&f) { + ASSERT(accessor.device_type == DeviceType::CPU); + + for (ArrayCoord const &coord : get_array_coord_set(accessor.shape)) { + accessor.at
<DT>(coord.ff_ordered) + = f(accessor.at<DT>
(coord.ff_ordered)); + } + } +}; + +template <typename F> +void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, + F &&f) { + ASSERT(accessor.device_type == DeviceType::CPU); + + DataTypeDispatch1<CPUMapTensorAccessorInPlace>{}(accessor.data_type, accessor, f); +} + template <DataType DT> struct CPUMapTensorAccessor { template <typename F, typename Out = std::invoke_result_t<F, real_type_t<DT>>> @@ -23,7 +46,9 @@ struct CPUMapTensorAccessor { ASSERT(output.device_type == DeviceType::CPU); for (ArrayCoord const &coord : get_array_coord_set(shape)) { - output.at<DT>(coord.ff_ordered) + output.at< + type_to_data_type_enum_v<std::invoke_result_t<F, real_type_t<DT>>> + >(coord.ff_ordered) = f(input.at<DT>
(coord.ff_ordered)); } } @@ -31,8 +56,8 @@ struct CPUMapTensorAccessor { template > GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, - Allocator &output_allocator, - F &&f) { + F &&f, + Allocator &output_allocator) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); @@ -43,9 +68,12 @@ GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, return copy_tensor_accessor_w(output_cpu, output_allocator); } -template +template struct CPUMapTensorAccessors2 { - template > + template < + typename F, + typename Out = std::invoke_result_t, real_type_t> + > void operator()(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, GenericTensorAccessorW &output, @@ -63,26 +91,25 @@ struct CPUMapTensorAccessors2 { for (ArrayCoord const &coord : get_array_coord_set(shape)) { output.at>(coord.ff_ordered) - = f(lhs.at
<DT>(coord.ff_ordered), rhs.at<DT>
(coord.ff_ordered)); + = f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); } } }; -template > +template GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, - Allocator &output_allocator, - F &&f) { + DataType output_data_type, + F &&f, + Allocator &output_allocator) { ArrayShape shape = require_same(lhs.shape, rhs.shape); - DataType input_data_type = require_same(lhs.data_type, rhs.data_type); Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR lhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); - DataType output_data_type = type_to_data_type_enum_v; GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); - DataTypeDispatch1{}(input_data_type, lhs_cpu, rhs_cpu, output_cpu, f); + DataTypeDispatch2{}(lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 39284b4a6f..51e6f8640f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -43,18 +43,20 @@ void adam_ps_update_task_gpu(ffStream_t, float *adam_v_ptr, float *adam_m_ptr); +#ifdef FF_USE_NCCL void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, float beta1, float beta2, float weight_decay, float epsilon, - size_t size, PerDeviceFFHandle const &, float const *weight_grad_ptr, + size_t size, float *weight_ptr, float *adam_v_ptr, float *adam_m_ptr); +#endif } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reduce_tensor_accessor.h b/lib/kernels/include/kernels/reduce_tensor_accessor.h new file mode 100644 index 0000000000..4be375299f --- /dev/null +++ b/lib/kernels/include/kernels/reduce_tensor_accessor.h @@ -0,0 +1,88 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/array_coord.h" +#include "utils/containers/contains.h" +#include "utils/containers/sorted.h" +#include "utils/containers/group_by.h" +#include "utils/containers/transform.h" +#include "utils/containers/foldl1.h" +#include "utils/containers/foldr1.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CPUReduceTensorAccessorInDims { + template + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + std::unordered_set const &dims_to_reduce, + F &&f) { + using T = real_type_t
<DT>; + + ASSERT(input.device_type == DeviceType::CPU); + ASSERT(output.device_type == DeviceType::CPU); + + auto should_drop_dim = [&](ff_dim_t dim) -> bool { + return contains(dims_to_reduce, dim); + }; + + std::unordered_map<ArrayCoord, std::unordered_set<ArrayCoord>> output_coord_from_input_coord + = group_by(get_array_coord_set(input.shape), + [&](ArrayCoord const &input_coord) { return array_coord_drop_dims(input_coord, should_drop_dim); }); + + for (auto const &[output_coord, input_coords] : output_coord_from_input_coord) { + std::vector<T> input_values = transform(sorted(input_coords), + [&](ArrayCoord const &input_coord) -> T { + return input.at<DT>
(input_coord.ff_ordered); + }); + + T result = foldl1(input_values, f); + ASSERT(result == foldr1(input_values, [&](T const &accum, T const &elem) { return f(elem, accum); })); + + output.at<DT>
(output_coord.ff_ordered) = result; + } + } +}; + +template <typename F> +GenericTensorAccessorW reduce_tensor_accessor_in_dims( + GenericTensorAccessorR const &input, + std::unordered_set<ff_dim_t> const &dims, + Allocator &output_allocator, + F &&f) { + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + auto should_drop_dim = [&](ff_dim_t dim) -> bool { + return contains(dims, dim); + }; + + ArrayShape reduced_shape = array_shape_drop_dims(input.shape, should_drop_dim); + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(reduced_shape, input.data_type)); + + DataTypeDispatch1<CPUReduceTensorAccessorInDims>{}(input_cpu.data_type, input_cpu, output_cpu, dims, f); + + return copy_tensor_accessor_w(output_cpu, output_allocator); +} + +template <DataType DT, typename F> +real_type_t<DT>
reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &input, + F &&f) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + std::unordered_set<ff_dim_t> input_dims = get_ff_dim_t_set(input.shape); + GenericTensorAccessorW reduced = reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); + + return accessor_get_only_value<DT>
(reduced); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reduce_tensor_accessors.h b/lib/kernels/include/kernels/reduce_tensor_accessors.h deleted file mode 100644 index c80c41778f..0000000000 --- a/lib/kernels/include/kernels/reduce_tensor_accessors.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H -#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H - -#include "kernels/accessor.h" -#include "kenrels/allocation.h" - -namespace FlexFlow { - - - -template -struct CPUReduceTensorAccessorInDims { - template - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - std::unordered_set const &dims_to_reduce, - F &&f) { - - ASSERT(input.device_type == DeviceType::CPU); - ASSERT(output.device_type == DeviceType::CPU); - - for (ArrayCoord const &coord : get_array_coord_set(input.shape)) { - output.at>(coord) - } - } -}; - -template -GenericTensorAccessorW reduce_tensor_accessor_in_dims(std::unordered_set const &dims, - F &&f) { - -} - -GenericTensorAccessorW reduce_tensor_accessor_all(GenericTensorAcessorR const &input, - Allocator &allocator); - -} // namespace FlexFlow - -#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml index a5dbd750bc..1689594491 100644 --- a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -8,21 +8,21 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_out_blks" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "reverse_dim_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "in_blk_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "out_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/kernels/include/kernels/tensor_accessor_reductions.h b/lib/kernels/include/kernels/tensor_accessor_reductions.h new file mode 100644 index 0000000000..03502b6943 --- /dev/null +++ b/lib/kernels/include/kernels/tensor_accessor_reductions.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_REDUCTIONS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_REDUCTIONS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +bool tensor_accessor_all(GenericTensorAccessorR const &); +bool tensor_accessor_any(GenericTensorAccessorR const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc index cdd57b8947..08a98f165b 100644 --- a/lib/kernels/src/cpu/ops/cast_kernels.cc +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -21,7 +21,7 @@ template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + size_t volume = input.shape.num_elements().int_from_positive_int(); cpu_cast_forward(input.get(), output.get(), volume); } }; @@ -30,7 +30,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume().unwrap_nonnegative(); + size_t volume = 
output.shape.num_elements().int_from_positive_int(); cpu_cast_backward( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc index 577984f21a..557f523f17 100644 --- a/lib/kernels/src/cpu/ops/combine_kernels.cc +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -9,8 +9,8 @@ struct CPUForwardKernel { GenericTensorAccessorW const &output) { memcpy(output.get
<DT>(), input.get<DT>
(), - input.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(DT).unwrap_nonnegative()); + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int()); } }; @@ -18,7 +18,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); for (int i = 0; i < num_elements; ++i) { input_grad.get
<DT>()[i] += output_grad.get<DT>
()[i]; } diff --git a/lib/kernels/src/cpu/ops/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc index 91f4f46ef8..c7f43b5762 100644 --- a/lib/kernels/src/cpu/ops/initializer_kernels.cc +++ b/lib/kernels/src/cpu/ops/initializer_kernels.cc @@ -9,7 +9,7 @@ template struct ZeroInitKernel { void operator()(GenericTensorAccessorW const &tensor) const { auto arr = get
<DT>(tensor); - for (size_t i = 0; i < get_volume(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { arr[i] = 0.0f; } } @@ -25,7 +25,7 @@ struct ConstantInitKernel { DataTypeValue value) const { auto arr = get<DT>
(tensor); auto unwrapped_value = value.get>(); - for (size_t i = 0; i < get_volume(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { arr[i] = unwrapped_value; } } diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc index 798a4ea8c7..d97a274d80 100644 --- a/lib/kernels/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -1,5 +1,6 @@ #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels_cpu.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow::Kernels::Replicate { @@ -9,8 +10,8 @@ struct CPUForwardKernel { GenericTensorAccessorW &output) { memcpy(output.get
<DT>(), input.get<DT>
(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(DT).unwrap_nonnegative()); + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int()); } }; @@ -18,11 +19,12 @@ template <DataType DT> struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, - nonnegative_int num_elements, + positive_int num_elements, nonnegative_int num_replicas) { using T = real_type_t
<DT>; - for (nonnegative_int i : nonnegative_range(num_elements)) { + for (nonnegative_int i : + nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { T cur_sum = 0; for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { cur_sum += output.at<T>
(LegionOrdered{replica_idx, i}); @@ -40,7 +42,7 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, size_t num_replicas) { - nonnegative_int num_elements = input.shape.num_elements(); + positive_int num_elements = input.shape.num_elements(); DataTypeDispatch1<CPUBackwardKernel>{}(input.data_type, output, input, diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc index 4d9eb8cc09..212a52881a 100644 --- a/lib/kernels/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -9,7 +9,7 @@ struct CPUReverseForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW &output, ReverseAttrs const &attrs) { - nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + positive_int reverse_axis_size = input.shape.at(attrs.axis); for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { nonnegative_int input_reverse_axis_coord = @@ -17,7 +17,7 @@ ArrayCoord output_coord = input_coord; output_coord.ff_ordered.at(attrs.axis) = - nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + nonnegative_int{reverse_axis_size.int_from_positive_int() - input_reverse_axis_coord.unwrap_nonnegative() - 1}; output.at<DT>
(output_coord.ff_ordered) = diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 86b2d8a437..98faadf5ac 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -224,10 +224,10 @@ ffStatus_t tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, - shape.at_maybe(legion_dim_t{0_n}).value_or(1_n).unwrap_nonnegative(), - shape.at_maybe(legion_dim_t{1_n}).value_or(1_n).unwrap_nonnegative(), - shape.at_maybe(legion_dim_t{2_n}).value_or(1_n).unwrap_nonnegative(), - shape.at_maybe(legion_dim_t{3_n}).value_or(1_n).unwrap_nonnegative()); + shape.at_maybe(legion_dim_t{0_n}).value_or(1_p).int_from_positive_int(), + shape.at_maybe(legion_dim_t{1_n}).value_or(1_p).int_from_positive_int(), + shape.at_maybe(legion_dim_t{2_n}).value_or(1_p).int_from_positive_int(), + shape.at_maybe(legion_dim_t{3_n}).value_or(1_p).int_from_positive_int()); } cudnnDataType_t ff_to_cudnn_datatype(DataType type) { diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index cb84f0e777..a7e28c6297 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -343,7 +343,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -354,7 +354,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -380,7 +380,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -391,7 +391,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -417,7 +417,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -428,7 +428,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -454,7 +454,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -465,7 +465,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -491,7 +491,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -502,7 +502,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -528,7 +528,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -539,7 +539,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -580,7 +580,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -590,7 +590,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -616,7 +616,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -626,7 +626,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -652,7 +652,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -662,7 +662,7 @@ struct 
BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -688,7 +688,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -698,7 +698,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -724,7 +724,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -734,7 +734,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -760,7 +760,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -770,7 +770,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index f3ea6db660..3de6de9d5e 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -41,7 +41,7 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + size_t volume = input.shape.num_elements().int_from_positive_int(); cast_forward<<>>( input.get(), output.get(), volume); } @@ -52,7 +52,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume().unwrap_nonnegative(); + size_t volume = output.shape.num_elements().int_from_positive_int(); cast_backward<<>>( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 08cc343fd2..4920696756 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -29,8 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get
<DT>(), input.get<DT>
(), - input.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(DT).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -41,7 +41,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); add_kernel<real_type_t<DT>> <<<GET_BLOCKS(num_elements), CUDA_NUM_THREADS, 0, stream>>>( input_grad.get
<DT>(), output_grad.get<DT>
(), num_elements); diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 37dbbe12f8..e7f88bc258 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -30,10 +30,10 @@ void calc_blk_size(size_t &num_blocks, } blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); num_blocks = shape.sub_shape(legion_axis, std::nullopt) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); } void forward_kernel(cudaStream_t stream, diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 16db62a57f..6e446008ed 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -137,15 +137,15 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - int input_w = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); - int input_h = input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); - int input_c = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); - int input_n = input.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); + int input_w = input.shape.at(legion_dim_t(0_n)).int_from_positive_int(); + int input_h = input.shape.at(legion_dim_t(1_n)).int_from_positive_int(); + int input_c = input.shape.at(legion_dim_t(2_n)).int_from_positive_int(); + int input_n = input.shape.at(legion_dim_t(3_n)).int_from_positive_int(); - int output_w = output.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); - int output_h = output.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); - int output_c = output.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); - int output_n = output.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); + int output_w = output.shape.at(legion_dim_t(0_n)).int_from_positive_int(); + int output_h = output.shape.at(legion_dim_t(1_n)).int_from_positive_int(); + int output_c = output.shape.at(legion_dim_t(2_n)).int_from_positive_int(); + int output_n = output.shape.at(legion_dim_t(3_n)).int_from_positive_int(); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 218e74b939..21ac95c204 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -266,7 +266,7 @@ struct ForwardKernel { output.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_forward_kernel> <<>>( num_elements, @@ -275,7 +275,7 @@ struct ForwardKernel { input.get(), output.get()); } else { - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_forward_kernel> <<>>( num_elements, op_type, input.get(), output.get()); @@ -312,7 +312,7 @@ struct BackwardKernel { input_grad.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_backward_kernel> <<>>( num_elements, 
@@ -323,7 +323,7 @@ struct BackwardKernel { input.get(), input_grad.get()); } else { - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_backward_kernel> <<>>( num_elements, diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 594a183ff0..9dee095071 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -27,7 +27,7 @@ void forward_kernel(cudaStream_t stream, checkCUDA(cudaMemcpyAsync(output_ptr, input.get_float_ptr(), - input.shape.num_elements().unwrap_nonnegative() * + input.shape.num_elements().int_from_positive_int() * sizeof(float), cudaMemcpyDeviceToDevice, stream)); @@ -40,12 +40,12 @@ void backward_kernel(cudaStream_t stream, float alpha = 1.0f; apply_add_with_scale - <<>>(input_grad_ptr, output_grad_ptr, - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), alpha); } diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 19e495a540..bee8f68eef 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -127,13 +127,13 @@ void forward_kernel(ffStream_t stream, output.shape .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); if (m.legion_dim.value == 0_n) { stride = 1; } - coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); - coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t output_dim_size = output.shape.at(m.legion_dim).int_from_positive_int(); + coord_t input_dim_size = input.shape.at(m.legion_dim).int_from_positive_int(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); @@ -144,7 +144,7 @@ void forward_kernel(ffStream_t stream, input, index, output, - output.shape.get_volume().unwrap_nonnegative(), + output.shape.num_elements().int_from_positive_int(), stride, input_dim_size, output_dim_size); @@ -161,15 +161,15 @@ void backward_kernel(ffStream_t stream, output_grad.shape .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); if (m.legion_dim.value == 0_n) { stride = 1; } coord_t output_dim_size = - output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); + output_grad.shape.at(m.legion_dim).int_from_positive_int(); coord_t input_dim_size = - input_grad.shape.at(m.legion_dim).unwrap_nonnegative(); + input_grad.shape.at(m.legion_dim).int_from_positive_int(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); @@ -180,7 +180,7 @@ void backward_kernel(ffStream_t stream, output_grad, index, input_grad, - output_grad.shape.get_volume().unwrap_nonnegative(), + output_grad.shape.num_elements().int_from_positive_int(), stride, input_dim_size, output_dim_size); diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index b8dfac5204..e4a83a12c8 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -29,8 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + 
input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -43,12 +43,12 @@ struct BackwardKernel { GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { add_kernel> - <<>>(input_grad.get(), output_grad.get(), - input_grad.shape.num_elements().unwrap_nonnegative()); + input_grad.shape.num_elements().int_from_positive_int()); } }; diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index d9c09b082d..ac3b7c9b08 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -42,12 +42,12 @@ struct ForwardKernel { size_t num_replicas) { size_t total_elements = - input.shape.num_elements().unwrap_nonnegative() * num_replicas; + input.shape.num_elements().int_from_positive_int() * num_replicas; reduction_forward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), num_replicas); } }; @@ -59,8 +59,8 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4685fd7a2d..23e65cc1f3 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -40,8 +40,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -54,12 +54,12 @@ struct BackwardKernel { GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = - input.shape.num_elements().unwrap_nonnegative() * num_replicas; + input.shape.num_elements().int_from_positive_int() * num_replicas; replicate_backward_kernel> <<>>( output.get(), input.get(), - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), num_replicas); } }; diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index a6a390b38e..06aa8d74b2 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -33,8 +33,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -47,12 +47,12 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> - <<>>(input.get(), output.get(), - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), static_cast>(alpha)); } }; diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu 
b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 582aa02386..c63be7f9b4 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -63,10 +63,10 @@ void forward_kernel(ffStream_t stream, stream, input_accessor.get_float_ptr(), output_accessor.get_float_ptr(), - reverse_kernels_params.num_out_blks.unwrap_nonnegative(), - reverse_kernels_params.reverse_dim_size.unwrap_nonnegative(), - reverse_kernels_params.in_blk_size.unwrap_nonnegative(), - reverse_kernels_params.out_size.unwrap_nonnegative()); + reverse_kernels_params.num_out_blks.int_from_positive_int(), + reverse_kernels_params.reverse_dim_size.int_from_positive_int(), + reverse_kernels_params.in_blk_size.int_from_positive_int(), + reverse_kernels_params.out_size.int_from_positive_int()); } void backward_kernel_internal(cudaStream_t stream, @@ -95,10 +95,10 @@ void backward_kernel(ffStream_t stream, stream, output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - reverse_kernels_params.num_out_blks.unwrap_nonnegative(), - reverse_kernels_params.reverse_dim_size.unwrap_nonnegative(), - reverse_kernels_params.in_blk_size.unwrap_nonnegative(), - reverse_kernels_params.out_size.unwrap_nonnegative()); + reverse_kernels_params.num_out_blks.int_from_positive_int(), + reverse_kernels_params.reverse_dim_size.int_from_positive_int(), + reverse_kernels_params.in_blk_size.int_from_positive_int(), + reverse_kernels_params.out_size.int_from_positive_int()); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 91f3d48a35..13162a9888 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -77,9 +77,9 @@ void forward_kernel(cudaStream_t stream, info.out_strides[i] = 1; } else { int in_dim_size = - input.shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); + input.shape.at(legion_dim_t{nonnegative_int{i}}).int_from_positive_int(); int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) - .unwrap_nonnegative(); + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -88,10 +88,10 @@ void forward_kernel(cudaStream_t stream, .value.unwrap_nonnegative(); } transpose_simple_kernel<<< - GET_BLOCKS(output.shape.get_volume().unwrap_nonnegative()), + GET_BLOCKS(output.shape.num_elements().int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(output.shape.get_volume().unwrap_nonnegative(), + stream>>>(output.shape.num_elements().int_from_positive_int(), input.get_float_ptr(), output.get_float_ptr(), info, @@ -116,9 +116,9 @@ void backward_kernel(cudaStream_t stream, info.out_strides[i] = 1; } else { int in_dim_size = out_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .unwrap_nonnegative(); + .int_from_positive_int(); int out_dim_size = in_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .unwrap_nonnegative(); + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -126,10 +126,10 @@ void backward_kernel(cudaStream_t stream, .value.unwrap_nonnegative()] = i; } transpose_simple_kernel<<< - GET_BLOCKS(in_grad.shape.get_volume().unwrap_nonnegative()), + GET_BLOCKS(in_grad.shape.num_elements().int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(in_grad.shape.get_volume().unwrap_nonnegative(), + 
stream>>>(in_grad.shape.num_elements().int_from_positive_int(), out_grad.get_float_ptr(), in_grad.get_float_ptr(), info, diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index fe817876ce..e1ab7eb92c 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -167,7 +167,7 @@ __host__ void adam_ps_update_task_gpu(ffStream_t stream, } #ifdef FF_USE_NCCL -__host__ void nccl_update_task_gpu(ffStream_t stream, +__host__ void adam_nccl_update_task_gpu(ffStream_t stream, float alpha_t, float beta1, float beta2, diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index 409b7533f9..46137c3c9c 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -15,7 +15,7 @@ nonnegative_int "Number of indices does not match the number of dimensions"); nonnegative_int offset = 0_n; - nonnegative_int multiplier = 1_n; + positive_int multiplier = 1_p; for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), @@ -33,8 +33,8 @@ void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { size_t num_bytes = - dst_accessor.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + dst_accessor.shape.num_elements().int_from_positive_int() * + size_of_datatype(dst_accessor.data_type).int_from_positive_int(); DeviceType dst_device_type = dst_accessor.device_type; DeviceType src_device_type = src_accessor.device_type; @@ -221,12 +221,60 @@ std::vector return get(a); } +int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +float *get_float_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +double *get_double_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +half *get_half_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +std::vector + get_int32_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_int64_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_float_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_double_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_half_ptrs(std::vector const &a) { + return get(a); +} + + GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{writable.data_type, - writable.shape, - req(writable.ptr), - writable.device_type}; + return GenericTensorAccessorR{ + writable.data_type, + writable.shape, + writable.ptr, + writable.device_type, + }; } bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, @@ -263,4 +311,7 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } +template + int32_t accessor_get_only_value(GenericTensorAccessorR const &); + } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc index b9f253bcff..a6881d240a 100644 --- a/lib/kernels/src/kernels/allocation.cc +++ b/lib/kernels/src/kernels/allocation.cc @@ -18,7 +18,7 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); + 
this->allocate(get_size_in_bytes(tensor_shape).int_from_positive_int()); return GenericTensorAccessorW{ tensor_shape.data_type, array_shape_from_tensor_shape(tensor_shape), diff --git a/lib/kernels/src/kernels/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc index 34a53c1bb3..18b8861164 100644 --- a/lib/kernels/src/kernels/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -11,40 +11,31 @@ #include "utils/hash/tuple.h" #include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { -ArrayShape::ArrayShape(LegionOrdered const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} -nonnegative_int ArrayShape::get_volume() const { - return this->num_elements(); -} - nonnegative_int ArrayShape::num_dims() const { return ::FlexFlow::num_elements(this->dims); } -nonnegative_int ArrayShape::get_dim() const { - return this->num_dims(); -} - -nonnegative_int ArrayShape::num_elements() const { - if (dims.size() == 0) { - return 0_n; - } +positive_int ArrayShape::num_elements() const { return product(this->dims); } -nonnegative_int ArrayShape::operator[](legion_dim_t idx) const { +positive_int ArrayShape::operator[](legion_dim_t idx) const { return dims.at(idx); } -nonnegative_int ArrayShape::at(legion_dim_t idx) const { +positive_int ArrayShape::at(legion_dim_t idx) const { return dims.at(idx); } -nonnegative_int ArrayShape::at(ff_dim_t idx) const { +positive_int ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } @@ -59,9 +50,9 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { ArrayShape ArrayShape::sub_shape(ff_dim_t const &start, std::optional const &maybe_end) const { - FFOrdered ff_ordered_dims = + FFOrdered ff_ordered_dims = ff_ordered_from_legion_ordered(this->dims); - FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; } @@ -71,7 +62,7 @@ ArrayShape return ArrayShape{slice(this->dims, start, maybe_end)}; } -std::optional ArrayShape::at_maybe(legion_dim_t index) const { +std::optional ArrayShape::at_maybe(legion_dim_t index) const { if (index.value < dims.size()) { return dims.at(index); } else { @@ -79,11 +70,11 @@ std::optional ArrayShape::at_maybe(legion_dim_t index) const { } } -std::optional ArrayShape::at_maybe(ff_dim_t index) const { +std::optional ArrayShape::at_maybe(ff_dim_t index) const { return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); } -std::tuple const &> ArrayShape::tie() const { +std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } @@ -99,8 +90,8 @@ std::ostream &operator<<(std::ostream &s, ArrayShape const &x) { return (s << fmt::to_string(x)); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); +positive_int get_num_elements(ArrayShape const &shape) { + return shape.num_elements(); } ArrayShape array_shape_from_tensor_shape(TensorShape const &tensor_shape) { @@ -113,11 +104,15 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { dtype}; } +std::unordered_set get_ff_dim_t_set(ArrayShape const &shape) { + return unordered_set_of(get_idxs(ff_ordered_from_legion_ordered(shape.dims))); +} + std::unordered_set get_array_coord_set(ArrayShape const &shape) { std::vector> per_dim_ranges = 
transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), - [](nonnegative_int dim_size) -> std::vector { - return nonnegative_range(dim_size); + [](positive_int dim_size) -> std::vector { + return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); }); std::unordered_set> raw_points = @@ -129,6 +124,18 @@ std::unordered_set get_array_coord_set(ArrayShape const &shape) { }); } +ArrayShape array_shape_drop_dims(ArrayShape const &shape, + std::function const &should_drop_dim) { + std::vector result; + for (ff_dim_t idx : get_idxs(ff_ordered_from_legion_ordered(shape.dims))) { + if (!should_drop_dim(idx)) { + result.push_back(shape.at(idx)); + } + } + + return ArrayShape{legion_ordered_from_ff_ordered(ff_ordered_of(result))}; +} + } // namespace FlexFlow namespace std { diff --git a/lib/kernels/src/kernels/compare_tensor_accessors.cc b/lib/kernels/src/kernels/compare_tensor_accessors.cc index 4594fed322..b1f5fd39b7 100644 --- a/lib/kernels/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/src/kernels/compare_tensor_accessors.cc @@ -6,45 +6,57 @@ namespace FlexFlow { GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l < r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l < r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l <= r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l <= r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l > r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l > r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l >= r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l >= r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l == r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l == r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l != r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l != r; }, + output_allocator); } } // namespace FlexFlow 
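Note: the compare_tensor_accessors_* helpers above now pass the output element type (DataType::BOOL) to map_tensor_accessors2 explicitly, since an elementwise comparison yields BOOL output regardless of the operand dtype, so the output dtype can no longer be inferred from the inputs. A minimal usage sketch under the signature introduced by this patch (tensor contents are illustrative only; create_1d_accessor_r_with_contents is the test helper used elsewhere in this series):

    Allocator cpu_allocator = create_local_cpu_memory_allocator();
    GenericTensorAccessorR lhs =
        create_1d_accessor_r_with_contents({1, 4, 2}, cpu_allocator);
    GenericTensorAccessorR rhs =
        create_1d_accessor_r_with_contents({2, 3, 2}, cpu_allocator);
    // INT32 inputs, BOOL output: the output dtype is its own argument.
    GenericTensorAccessorW lt = map_tensor_accessors2(
        lhs,
        rhs,
        DataType::BOOL,
        [](auto const &l, auto const &r) { return l < r; },
        cpu_allocator);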
diff --git a/lib/kernels/src/kernels/fill_tensor_accessor.cc b/lib/kernels/src/kernels/fill_tensor_accessor.cc new file mode 100644 index 0000000000..f173bd0860 --- /dev/null +++ b/lib/kernels/src/kernels/fill_tensor_accessor.cc @@ -0,0 +1,26 @@ +#include "kernels/fill_tensor_accessor.h" +#include "op-attrs/datatype_value.h" + +namespace FlexFlow { + +void fill_tensor_accessor(GenericTensorAccessorW &accessor, DataTypeValue val) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.data_type == get_data_type_of_data_type_value(val)); + +} + +GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator) { + NOT_IMPLEMENTED(); +} + +GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator) { + return read_only_accessor_from_write_accessor( + create_accessor_w_filled_with(shape, val, allocator)); +} + + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc index 1b8ab35d89..3d24483967 100644 --- a/lib/kernels/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -4,6 +4,7 @@ #include "kernels/local_cpu_allocator.h" #include "utils/indent.h" #include +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -15,10 +16,10 @@ struct Print1DCPUAccessorR { nonnegative_int dims = accessor.shape.num_dims(); ASSERT(dims == 1_n); - nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + positive_int ncols = accessor.shape.at(ff_dim_t{0_n}); stream << "[" - << join_strings(nonnegative_range(ncols), + << join_strings(nonnegative_range(ncols.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int col_idx) -> std::string { return fmt::to_string( @@ -45,12 +46,12 @@ struct Print2DCPUAccessorR { ASSERT(accessor.device_type == DeviceType::CPU); nonnegative_int dims = accessor.shape.num_dims(); ASSERT(dims == 2_n); - nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); - nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim1_size), + join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int dim1_idx) -> std::string { return fmt::to_string( @@ -61,7 +62,7 @@ struct Print2DCPUAccessorR { stream << "[\n" << indent( - join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_1d)) << "\n]"; } }; @@ -84,14 +85,14 @@ struct Print3DCPUAccessorR { nonnegative_int dims = accessor.shape.num_dims(); ASSERT(dims == 3_n); - nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); - nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); - nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + positive_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); auto render_1d = [&](nonnegative_int dim0_idx, nonnegative_int dim1_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim2_size), + join_strings(nonnegative_range(dim2_size.nonnegative_int_from_positive_int()), " 
", [&](nonnegative_int dim2_idx) -> std::string { return fmt::to_string(accessor.at
( @@ -102,7 +103,7 @@ struct Print3DCPUAccessorR { auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { return "[\n" + - indent(join_strings(nonnegative_range(dim1_size), + indent(join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), "\n", [&](nonnegative_int dim1_idx) -> std::string { return render_1d(dim0_idx, dim1_idx); @@ -112,7 +113,7 @@ struct Print3DCPUAccessorR { stream << "[\n" << indent( - join_strings(nonnegative_range(dim0_size), "\n", render_2d)) + join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_2d)) << "\n]"; } }; diff --git a/lib/kernels/src/kernels/map_tensor_accessors.cc b/lib/kernels/src/kernels/map_tensor_accessors.cc index 619f1cc412..c59d2207d0 100644 --- a/lib/kernels/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/src/kernels/map_tensor_accessors.cc @@ -9,18 +9,19 @@ struct F1 { template GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &, - Allocator &, - F1 &&); + F1 &&, + Allocator &); struct F2 { - template - float operator()(T const &lhs, T const &rhs) const { NOT_IMPLEMENTED(); } + template + float operator()(T1 const &lhs, T2 const &rhs) const { NOT_IMPLEMENTED(); } }; template GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &, GenericTensorAccessorR const &, - Allocator &, - F2 &&); + DataType, + F2 &&, + Allocator &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/src/kernels/reduce_tensor_accessor.cc new file mode 100644 index 0000000000..b9c4cee085 --- /dev/null +++ b/lib/kernels/src/kernels/reduce_tensor_accessor.cc @@ -0,0 +1,17 @@ +#include "kernels/reduce_tensor_accessor.h" + +namespace FlexFlow { + +using F = std::function; + +template + GenericTensorAccessorW reduce_tensor_accessor_in_dims( + GenericTensorAccessorR const &, + std::unordered_set const &, + Allocator &, + F &&); + +template + int32_t reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &, F &&); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc index c647181872..0ad1a5ed20 100644 --- a/lib/kernels/src/kernels/reverse_kernels_params.cc +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -6,10 +6,10 @@ ReverseKernelsParams compute_reverse_kernels_params(ArrayShape const &output_shape, ReverseAttrs const &attrs) { auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + positive_int in_blk_size = 1_p; + positive_int reverse_dim_size = 1_p; + positive_int num_out_blks = 1_p; + for (nonnegative_int i : nonnegative_range(output_shape.num_dims())) { if (i < axis.value) { in_blk_size *= output_shape.at(ff_dim_t{i}); } else if (i == axis.value) { @@ -23,7 +23,7 @@ ReverseKernelsParams num_out_blks, reverse_dim_size, in_blk_size, - output_shape.get_volume(), + output_shape.num_elements(), }; } diff --git a/lib/kernels/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/src/kernels/tensor_accessor_reductions.cc new file mode 100644 index 0000000000..baeb9fadc1 --- /dev/null +++ b/lib/kernels/src/kernels/tensor_accessor_reductions.cc @@ -0,0 +1,27 @@ +#include "kernels/tensor_accessor_reductions.h" +#include "kernels/reduce_tensor_accessor.h" +#include "utils/overload.h" + +namespace FlexFlow { + +bool tensor_accessor_all(GenericTensorAccessorR 
const &t) { + ASSERT(t.data_type == DataType::BOOL); + + return reduce_tensor_accessor_in_all_dims( + t, overload { + [](bool lhs, bool rhs) -> bool { return lhs && rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); +} + +bool tensor_accessor_any(GenericTensorAccessorR const &t) { + ASSERT(t.data_type == DataType::BOOL); + + return reduce_tensor_accessor_in_all_dims( + t, overload { + [](bool lhs, bool rhs) -> bool { return lhs || rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); +} + +} // namespace FlexFlow diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 066cb96753..981f87b3d8 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -16,10 +16,3 @@ ff_add_test_executable( cublas pcg ) - -set(FF_TEST_EXEC_NAME "kernels-tests") -add_custom_command( - TARGET ${FF_TEST_EXEC_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake - DEPENDS ${FF_TEST_EXEC_NAME} -) diff --git a/lib/kernels/test/modify_test_commands.cmake b/lib/kernels/test/modify_test_commands.cmake deleted file mode 100644 index 6494ae2d78..0000000000 --- a/lib/kernels/test/modify_test_commands.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# modify_test_commands.cmake - -file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") - -foreach(ctest_tests_file IN LISTS ctest_tests_files) - file(READ "${ctest_tests_file}" content) - - # add nix run prefix - string(REGEX REPLACE - "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" - "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" - content "${content}") - - # add environment - # string(REGEX REPLACE - # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" - # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" - # content "${content}") - - file(WRITE "${ctest_tests_file}" "${content}") -endforeach() diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc index 6c35185524..be1e3832ff 100644 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::INT32, }; GenericTensorAccessorW result = @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::INT32, }; GenericTensorAccessorW result = diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc index 8c54f4453b..9e0f38c8d6 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW result = create_zero_filled_accessor_w( TensorShape{ - TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + TensorDims{FFOrdered{2_p, 2_p, 3_p}}, DataType::INT32, }, cpu_allocator); @@ -122,7 +122,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW result = create_zero_filled_accessor_w( TensorShape{ - TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + TensorDims{FFOrdered{2_p, 2_p, 3_p}}, 
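+          // 2_p, 3_p, etc. are positive_int literals (the positive_int
+          // counterpart of the _n nonnegative_int literals): TensorDims now
+          // stores positive_int, so zero-size dimensions are unrepresentable
+          // by construction.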
DataType::INT32, }, cpu_allocator); diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc index b20ea8ee6b..1d08adb56a 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -33,7 +33,7 @@ struct CreateRandomFilledAccessorW { std::random_device rd; std::mt19937 gen(rd()); - size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); + size_t num_elements = get_num_elements(shape).int_from_positive_int(); if constexpr (std::is_same<T, bool>::value) { std::bernoulli_distribution dist(0.5); for (size_t i = 0; i < num_elements; i++) { @@ -80,11 +80,11 @@ struct FillWithZeros { if (accessor.device_type == DeviceType::CPU) { memset(accessor.ptr, 0, - accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); + accessor.shape.num_elements().int_from_positive_int() * sizeof(T)); } else { checkCUDA(cudaMemset(accessor.ptr, 0, - accessor.shape.get_volume().unwrap_nonnegative() * + accessor.shape.num_elements().int_from_positive_int() * sizeof(T))); } } @@ -101,7 +101,7 @@ struct CPUAccessorRContainsNonZero { T const *data_ptr = accessor.get<DT>
(); - int volume = accessor.shape.num_elements().unwrap_nonnegative(); + int volume = accessor.shape.num_elements().int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (data_ptr[i] != 0) { return true; @@ -134,7 +134,7 @@ struct AccessorsAreEqual { T const *a_data_ptr = cpu_accessor_a.get<DT>
(); T const *b_data_ptr = cpu_accessor_b.get<DT>
(); - int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + int volume = accessor_a.shape.num_elements().int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (a_data_ptr[i] != b_data_ptr[i]) { return false; @@ -172,7 +172,7 @@ struct CreateFilledAccessorW { T *data_ptr = src_accessor.get<DT>
(); - int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + int volume = dst_accessor.shape.num_elements().int_from_positive_int(); for (size_t i = 0; i < volume; i++) { data_ptr[i] = unwrapped_value; } diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 98f8471212..2f7e908e0b 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -2,6 +2,7 @@ #include "internal/test_utils.h" #include "kernels/local_cpu_allocator.h" #include +#include "kernels/create_accessor_with_contents.h" using namespace ::FlexFlow; @@ -10,8 +11,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("one dimension") { std::vector indices = {4_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 13_n, + std::vector{ + 13_p, }, }; @@ -24,9 +25,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("multiple dimensions") { std::vector indices = {2_n, 4_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 6_n, - 5_n, + std::vector{ + 6_p, + 5_p, }, }; @@ -38,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("zero dimensions") { std::vector indices = {}; - ArrayShape shape = ArrayShape{std::vector{}}; + ArrayShape shape = ArrayShape{std::vector{}}; nonnegative_int result = calculate_accessor_offset(indices, shape); nonnegative_int correct = 0_n; @@ -49,9 +50,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("index and shape dimensions do not match") { std::vector indices = {1_n, 2_n, 4_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 6_n, - 5_n, + std::vector{ + 6_p, + 5_p, }, }; @@ -61,13 +62,58 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("out of bounds index") { std::vector indices = {2_n, 5_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 6_n, - 5_n, + std::vector{ + 6_p, + 5_p, }, }; CHECK_THROWS(calculate_accessor_offset(indices, shape)); } } + + TEST_CASE("accessor_get_only_value") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("returns the value if the accessor only contains one value") { + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {12}, + }, + }, + cpu_allocator); + + float result = accessor_get_only_value(input); + float correct = 12; + + CHECK(result == correct); + } + + + SUBCASE("throws an error if the requested type does not match the type in the accessor") { + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {12}, + }, + }, + cpu_allocator); + + CHECK_THROWS(accessor_get_only_value(input)); + } + + SUBCASE("throws an error if the accessor contains multiple values") { + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {12}, + {12}, + }, + }, + cpu_allocator); + + CHECK_THROWS(accessor_get_only_value(input)); + } + } } diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc index 1fb4c0b541..2665cdda36 100644 --- a/lib/kernels/test/src/kernels/array_shape.cc +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_array_coord_set") { SUBCASE("ArrayShape is not empty") { ArrayShape input = ArrayShape{ - LegionOrdered{2_n, 1_n, 3_n}, + LegionOrdered{2_p, 1_p, 3_p}, }; std::unordered_set result = get_array_coord_set(input); @@ -24,26 +24,67 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("ArrayShape has a dimension of size zero") { - ArrayShape input = ArrayShape{ - LegionOrdered{2_n, 0_n, 3_n}, - }; + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; 
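+      // With dims stored as positive_int, num_elements() is the product over
+      // an empty dim list, i.e. 1, so a zero-dimensional shape has exactly one
+      // coordinate: the empty ArrayCoord asserted below.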
std::unordered_set<ArrayCoord> result = get_array_coord_set(input); - std::unordered_set<ArrayCoord> correct = {}; + std::unordered_set<ArrayCoord> correct = { + ArrayCoord{FFOrdered{}}, }; CHECK(result == correct); } + } - SUBCASE("ArrayShape is zero-dimensional") { - ArrayShape input = ArrayShape{LegionOrdered{}}; + TEST_CASE("array_shape_drop_dims") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_p, 4_p, 3_p}, + }; - std::unordered_set<ArrayCoord> result = get_array_coord_set(input); - std::unordered_set<ArrayCoord> correct = { - ArrayCoord{FFOrdered{}}, + SUBCASE("removes dims specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { + return dim.value % 2_n == 0; + }; + + ArrayShape result = array_shape_drop_dims(input, should_drop_dim); + ArrayShape correct = ArrayShape{ + LegionOrdered{4_p}, }; CHECK(result == correct); } + + SUBCASE("is identity function if no dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { + return false; + }; + + ArrayShape result = array_shape_drop_dims(input, should_drop_dim); + ArrayShape correct = input; + + CHECK(result == correct); + } + + SUBCASE("returns empty shape if all dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { + return true; + }; + + ArrayShape result = array_shape_drop_dims(input, should_drop_dim); + ArrayShape correct = ArrayShape{LegionOrdered{}}; + + CHECK(result == correct); + } } } diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index d5124180af..54706ad74e 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -54,4 +54,167 @@ CHECK_MESSAGE(accessors_are_equal(result, correct), check_kv("result", format_accessor_w_contents(result))); } + + TEST_CASE("compare_tensor_accessors_le") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + }, + { + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents( + { + { + {5, 1, 0}, + }, + { + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_le(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {true, false, false}, + }, + { + {true, true, true}, + }, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_gt") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2, 1}, + {2, 1, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 1, 0}, + {2, 1, 5}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_gt(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {false, true, true}, + {false, false, false}, + }, + cpu_allocator); + +
CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_ge") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2}, + {2, 5}, + {1, 8}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 1}, + {3, 6}, + {1, 0}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_ge(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {false, true}, + {false, false}, + {true, true}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_eq") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2}, + {1, 8}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 2}, + {1, 8}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_eq(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {false, true}, + {true, true}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_ne") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2}, + {1, 8}, + {1, 2}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 2}, + {1, 8}, + {2, 2}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_ne(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {true, false}, + {false, false}, + {true, false}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } } diff --git a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc new file mode 100644 index 0000000000..a6cfdbc97f --- /dev/null +++ b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc @@ -0,0 +1,133 @@ +#include +#include "kernels/create_accessor_with_contents.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("create_1d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); + + auto at = [&](nonnegative_int c) -> float { + return result.at(FFOrdered{c}); + }; + + CHECK(at(0_n) == 1); + CHECK(at(1_n) == 4); + CHECK(at(2_n) == 1); + CHECK(at(3_n) == 2); + } + + TEST_CASE("create_2d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_2d_accessor_w_with_contents( + { + {1, 4, 2}, + {2, 2, 7}, + }, + cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 1); + CHECK(at(0_n, 1_n) == 4); + CHECK(at(0_n, 2_n) == 2); + CHECK(at(1_n, 0_n) == 2); + CHECK(at(1_n, 1_n) 
== 2); + CHECK(at(1_n, 2_n) == 7); + } + + TEST_CASE("create_3d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_3d_accessor_w_with_contents( + { + { + {1, 4}, + {2, 3}, + {7, 2}, + }, + { + {9, 3}, + {4, 5}, + {0, 2}, + }, + }, + cpu_allocator); + + auto at = [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{s, r, c}); + }; + + CHECK(at(0_n, 0_n, 0_n) == 1); + CHECK(at(0_n, 0_n, 1_n) == 4); + CHECK(at(0_n, 1_n, 0_n) == 2); + CHECK(at(0_n, 1_n, 1_n) == 3); + CHECK(at(0_n, 2_n, 0_n) == 7); + CHECK(at(0_n, 2_n, 1_n) == 2); + CHECK(at(1_n, 0_n, 0_n) == 9); + CHECK(at(1_n, 0_n, 1_n) == 3); + CHECK(at(1_n, 1_n, 0_n) == 4); + CHECK(at(1_n, 1_n, 1_n) == 5); + CHECK(at(1_n, 2_n, 0_n) == 0); + CHECK(at(1_n, 2_n, 1_n) == 2); + } + + TEST_CASE("create_4d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_4d_accessor_w_with_contents( + { + { + { + {2, 3}, + {7, 2}, + }, + { + {4, 5}, + {0, 2}, + }, + }, + { + { + {9, 6}, + {1, 2}, + }, + { + {8, 7}, + {3, 8}, + }, + }, + }, + cpu_allocator); + + auto at = [&](nonnegative_int s1, nonnegative_int s2, nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{s1, s2, r, c}); + }; + + CHECK(at(0_n, 0_n, 0_n, 0_n) == 2); + CHECK(at(0_n, 0_n, 0_n, 1_n) == 3); + CHECK(at(0_n, 0_n, 1_n, 0_n) == 7); + CHECK(at(0_n, 0_n, 1_n, 1_n) == 2); + CHECK(at(0_n, 1_n, 0_n, 0_n) == 4); + CHECK(at(0_n, 1_n, 0_n, 1_n) == 5); + CHECK(at(0_n, 1_n, 1_n, 0_n) == 0); + CHECK(at(0_n, 1_n, 1_n, 1_n) == 2); + CHECK(at(1_n, 0_n, 0_n, 0_n) == 9); + CHECK(at(1_n, 0_n, 0_n, 1_n) == 6); + CHECK(at(1_n, 0_n, 1_n, 0_n) == 1); + CHECK(at(1_n, 0_n, 1_n, 1_n) == 2); + CHECK(at(1_n, 1_n, 0_n, 0_n) == 8); + CHECK(at(1_n, 1_n, 0_n, 1_n) == 7); + CHECK(at(1_n, 1_n, 1_n, 0_n) == 3); + CHECK(at(1_n, 1_n, 1_n, 1_n) == 8); + } +} diff --git a/lib/kernels/test/src/kernels/map_tensor_accessors.cc b/lib/kernels/test/src/kernels/map_tensor_accessors.cc new file mode 100644 index 0000000000..fcc59b7935 --- /dev/null +++ b/lib/kernels/test/src/kernels/map_tensor_accessors.cc @@ -0,0 +1,151 @@ +#include +#include "kernels/map_tensor_accessors.h" +#include "kernels/create_accessor_with_contents.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("map_tensor_accessor_inplace") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW accessor = create_2d_accessor_w_with_contents( + { + {1, 3, 2}, + {2, 1, 5}, + }, + cpu_allocator); + + map_tensor_accessor_inplace(accessor, [](float x) { return x + 1; }); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return accessor.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 2); + CHECK(at(0_n, 1_n) == 4); + CHECK(at(0_n, 2_n) == 3); + CHECK(at(1_n, 0_n) == 3); + CHECK(at(1_n, 1_n) == 2); + CHECK(at(1_n, 2_n) == 6); + } + + TEST_CASE("map_tensor_accessor") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW input = create_2d_accessor_w_with_contents( + { + {1, 3, 2}, + {2, 1, 5}, + }, + cpu_allocator); + + SUBCASE("function is not type changing") { + GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) { return x + 1; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 2); + 
CHECK(at(0_n, 1_n) == 4); + CHECK(at(0_n, 2_n) == 3); + CHECK(at(1_n, 0_n) == 3); + CHECK(at(1_n, 1_n) == 2); + CHECK(at(1_n, 2_n) == 6); + } + + SUBCASE("function is type changing") { + GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) -> bool { return x > 2; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == false); + CHECK(at(0_n, 1_n) == true); + CHECK(at(0_n, 2_n) == false); + CHECK(at(1_n, 0_n) == false); + CHECK(at(1_n, 1_n) == false); + CHECK(at(1_n, 2_n) == true); + } + } + + TEST_CASE("map_tensor_accessors2") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW lhs = create_2d_accessor_w_with_contents( + { + {1, 3, 2}, + {2, 1, 5}, + }, + cpu_allocator); + + SUBCASE("argument types are the same") { + GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( + { + {0, 2, 5}, + {3, 3, 8}, + }, + cpu_allocator); + + SUBCASE("function is not type changing") { + GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::FLOAT, [](float l, float r) { return l + 2 * r; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 1); + CHECK(at(0_n, 1_n) == 7); + CHECK(at(0_n, 2_n) == 12); + CHECK(at(1_n, 0_n) == 8); + CHECK(at(1_n, 1_n) == 7); + CHECK(at(1_n, 2_n) == 21); + } + + SUBCASE("function is type changing") { + GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::BOOL, [](float l, float r) -> bool { return l > r; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == true); + CHECK(at(0_n, 1_n) == true); + CHECK(at(0_n, 2_n) == false); + CHECK(at(1_n, 0_n) == false); + CHECK(at(1_n, 1_n) == false); + CHECK(at(1_n, 2_n) == false); + } + } + + SUBCASE("argument types are not the same") { + GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( + { + {true, false, true}, + {true, false, false}, + }, + cpu_allocator); + + auto func = [](float l, bool r) -> double { + if (r) { + return (- l); + } else { + return l * 2; + } + }; + GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::DOUBLE, func, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> double { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == -1); + CHECK(at(0_n, 1_n) == 6); + CHECK(at(0_n, 2_n) == -2); + CHECK(at(1_n, 0_n) == -2); + CHECK(at(1_n, 1_n) == 2); + CHECK(at(1_n, 2_n) == 10); + } + } +} diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc new file mode 100644 index 0000000000..0e69b3b937 --- /dev/null +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -0,0 +1,68 @@ +#include +#include "kernels/reduce_tensor_accessor.h" +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "test/utils/doctest/check_kv.h" +#include "kernels/create_accessor_with_contents.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("reduce_tensor_accessor_in_dims") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = 
reduce_tensor_accessor_in_dims( + accessor, + {ff_dim_t{0_n}, ff_dim_t{2_n}}, + cpu_allocator, + [](int32_t accum, int32_t x) { return x + accum; }); + + GenericTensorAccessorW correct = create_1d_accessor_w_with_contents( + { + 1 + 3 + 2 + 4 + 2 + 1, + 2 + 1 + 5 + 8 + 3 + 6, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result =", format_accessor_w_contents(result)), + check_kv("correct=", format_accessor_w_contents(correct))); + } + + + TEST_CASE("reduce_tensor_accessor_in_all_dims") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); + + int32_t result = reduce_tensor_accessor_in_all_dims( + accessor, [](int32_t accum, int32_t elem) { return accum + elem; }); + int32_t correct = 1 + 3 + 2 + 2 + 1 + 5 + 4 + 2 + 1 + 8 + 3 + 6; + + CHECK(result == correct); + } +} diff --git a/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc new file mode 100644 index 0000000000..744b875ee7 --- /dev/null +++ b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc @@ -0,0 +1,106 @@ +#include +#include "kernels/create_accessor_with_contents.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/tensor_accessor_reductions.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("tensor_accessor_all") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("returns false if any elements are false") { + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {true, true, true}, + {true, true, true}, + }, + { + {true, false, true}, + {true, true, true}, + }, + }, + cpu_allocator); + + bool result = tensor_accessor_all(accessor); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("returns true if all elements are true") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {true, true, true}, + {true, true, true}, + }, + cpu_allocator); + + bool result = tensor_accessor_all(accessor); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("throw an error if the datatype is not bool") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); + + CHECK_THROWS(tensor_accessor_all(accessor)); + } + } + + TEST_CASE("tensor_accessor_any") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("returns true if any elements are true") { + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {false, false, false}, + {true, false, false}, + }, + { + {false, false, false}, + {false, false, false}, + }, + }, + cpu_allocator); + + bool result = tensor_accessor_any(accessor); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("returns false if all elements are false") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {false, false, false}, + {false, false, false}, + }, + cpu_allocator); + + bool result = tensor_accessor_any(accessor); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("throw an error if the datatype is not bool") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); + + CHECK_THROWS(tensor_accessor_any(accessor)); + } + } 
+} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 3a0f4ffdc4..3b024fdf55 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -6,17 +6,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { - nonnegative_int num_samples = 10_n; - nonnegative_int num_heads = 4_n; - nonnegative_int qSize = 64_n; - nonnegative_int kSize = 64_n; - nonnegative_int vSize = 64_n; - nonnegative_int qProjSize = 64_n; - nonnegative_int kProjSize = 64_n; - nonnegative_int vProjSize = 64_n; - nonnegative_int oProjSize = 64_n; - nonnegative_int qoSeqLength = 20_n; - nonnegative_int kvSeqLength = 20_n; + positive_int num_samples = 10_p; + positive_int num_heads = 4_p; + positive_int qSize = 64_p; + positive_int kSize = 64_p; + positive_int vSize = 64_p; + positive_int qProjSize = 64_p; + positive_int kProjSize = 64_p; + positive_int vProjSize = 64_p; + positive_int oProjSize = 64_p; + positive_int qoSeqLength = 20_p; + positive_int kvSeqLength = 20_p; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( @@ -29,17 +29,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { MHAPerDeviceState state = Kernels::MultiHeadAttention::init_kernel( managed_handle.raw_handle(), allocator, - /*num_samples=*/num_samples.unwrap_nonnegative(), - /*num_heads=*/num_heads.unwrap_nonnegative(), - /*qSize=*/qSize.unwrap_nonnegative(), - /*kSize=*/kSize.unwrap_nonnegative(), - /*vSize=*/vSize.unwrap_nonnegative(), - /*qProjSize=*/qProjSize.unwrap_nonnegative(), - /*kProjSize=*/kProjSize.unwrap_nonnegative(), - /*vProjSize=*/vProjSize.unwrap_nonnegative(), - /*oProjSize=*/oProjSize.unwrap_nonnegative(), - /*qoSeqLength=*/qoSeqLength.unwrap_nonnegative(), - /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), + /*num_samples=*/num_samples.int_from_positive_int(), + /*num_heads=*/num_heads.int_from_positive_int(), + /*qSize=*/qSize.int_from_positive_int(), + /*kSize=*/kSize.int_from_positive_int(), + /*vSize=*/vSize.int_from_positive_int(), + /*qProjSize=*/qProjSize.int_from_positive_int(), + /*kProjSize=*/kProjSize.int_from_positive_int(), + /*vProjSize=*/vProjSize.int_from_positive_int(), + /*oProjSize=*/oProjSize.int_from_positive_int(), + /*qoSeqLength=*/qoSeqLength.int_from_positive_int(), + /*kvSeqLength=*/kvSeqLength.int_from_positive_int(), /*add_bias_kv=*/false); TensorShape query_shape = TensorShape{ @@ -59,7 +59,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::FLOAT, }; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + TensorDims{FFOrdered{positive_int{state.weightSize}}}, DataType::FLOAT, }; diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index e10a80b57f..4ca8811b9b 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -6,10 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { - nonnegative_int m = 10_n; - nonnegative_int n = 10_n; - nonnegative_int k = 10_n; - nonnegative_int batch = 5_n; + positive_int m = 10_p; + positive_int n = 10_p; + positive_int k = 10_p; + positive_int batch = 5_p; int a_seq_length_dim = -1; int b_seq_length_dim = -1; int seq_length = -1; @@ -48,10 +48,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { output_accessor.get_float_ptr(), a_accessor.get_float_ptr(), 
b_accessor.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative(), + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int(), a_seq_length_dim, b_seq_length_dim, seq_length); @@ -73,10 +73,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { a_grad_accessor.get_float_ptr(), b_accessor.get_float_ptr(), b_grad_accessor.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative()); + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int()); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index c9a1bf05e6..00a26c3303 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -7,10 +7,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { - nonnegative_int output_n = 1_n; - nonnegative_int output_c = 10_n; - nonnegative_int output_h = 10_n; - nonnegative_int output_w = 10_n; + positive_int output_n = 1_p; + positive_int output_c = 10_p; + positive_int output_h = 10_p; + positive_int output_w = 10_p; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( @@ -24,10 +24,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*handle=*/managed_handle.raw_handle(), /*allocator=*/allocator, /*runningMean=*/nullptr, - /*output_n=*/output_n.unwrap_nonnegative(), - /*output_c=*/output_c.unwrap_nonnegative(), - /*output_h=*/output_h.unwrap_nonnegative(), - /*output_w=*/output_w.unwrap_nonnegative(), + /*output_n=*/output_n.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), /*relu=*/true); TensorShape input_shape = TensorShape{ @@ -90,7 +90,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), /*bias_grad_ptr=*/bias_grad_accessor.get_float_ptr(), /*numElements=*/ - input_accessor.shape.num_elements().unwrap_nonnegative()); + input_accessor.shape.num_elements().int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); CHECK(contains_non_zero(scale_grad_accessor)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0c41fe12ac..7539b2457c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -11,11 +11,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n, 100_n}}, + TensorDims{FFOrdered{100_p, 100_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{100_n, 100_n}}, + TensorDims{FFOrdered{100_p, 100_p}}, DataType::DOUBLE, }; @@ -52,11 +52,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 2_n}}, + TensorDims{FFOrdered{10_p, 2_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 2_n}}, + TensorDims{FFOrdered{10_p, 2_p}}, DataType::DOUBLE, }; diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index ddcb0d8c49..6ce415d48c 100644 --- 
a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n, 100_n}}, + TensorDims{FFOrdered{100_p, 100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; @@ -53,7 +53,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_n, 5_n}}, + TensorDims{FFOrdered{5_p, 5_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 20ebb52161..b22add8905 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -13,11 +13,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - const nonnegative_int num_inputs = 4_n; + const positive_int num_inputs = 4_p; SUBCASE("forward_kernel") { - auto run_forward_test = [&](nonnegative_int input_rows, - nonnegative_int input_cols, + auto run_forward_test = [&](positive_int input_rows, + positive_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { TensorShape input_shape = TensorShape{ @@ -26,7 +26,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; std::vector input_accessors = - repeat(num_inputs, [&]() { + repeat(num_inputs.nonnegative_int_from_positive_int(), [&]() { return create_random_filled_accessor_r(input_shape, allocator); }); @@ -42,8 +42,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; SUBCASE("test forward concat, axis = 0") { - nonnegative_int input_rows = 2_n; - nonnegative_int input_cols = 4_n; + positive_int input_rows = 2_p; + positive_int input_cols = 4_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, DataType::FLOAT, @@ -52,8 +52,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("test forward concat, axis = 1") { - nonnegative_int input_rows = 4_n; - nonnegative_int input_cols = 2_n; + positive_int input_rows = 4_p; + positive_int input_cols = 2_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, DataType::FLOAT, @@ -63,8 +63,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("backward_kernel") { - auto run_backward_test = [&](nonnegative_int input_rows, - nonnegative_int input_cols, + auto run_backward_test = [&](positive_int input_rows, + positive_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { TensorShape input_shape = TensorShape{ @@ -76,7 +76,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { create_random_filled_accessor_r(output_shape, allocator); std::vector input_grad_accessors = - repeat(num_inputs, [&]() { + repeat(num_inputs.nonnegative_int_from_positive_int(), [&]() { return create_zero_filled_accessor_w(input_shape, allocator); }); @@ -91,8 +91,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; SUBCASE("test backward concat, axis = 0") { - nonnegative_int input_rows = 2_n; - nonnegative_int input_cols = 4_n; + positive_int input_rows = 2_p; + positive_int input_cols = 4_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, DataType::FLOAT, @@ -101,8 +101,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("test backward concat, axis = 1") { - nonnegative_int input_rows = 4_n; - nonnegative_int input_cols = 2_n; + positive_int 
input_rows = 4_p; + positive_int input_cols = 2_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, DataType::FLOAT, diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 8379e062d5..1b224084f8 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -10,11 +10,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_p, 10_p}, }; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index dd44b8f50c..98896cca18 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index c387899709..52389ea0f5 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -39,15 +39,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -55,15 +55,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 1D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -91,15 +91,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::FLOAT, }; run_backward_test(input_shape, index_shape, output_shape); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index eb62784369..4f3b701bba 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -7,8 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { - nonnegative_int batch_size = 10_n; - nonnegative_int feature_size = 10_n; + positive_int batch_size = 10_p; + positive_int 
feature_size = 10_p; float epsilon = 1e-5f; bool elementwise_affine = true; @@ -34,8 +34,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Kernels::LayerNorm::init_kernel(managed_handle.raw_handle(), allocator, elementwise_affine, - batch_size.unwrap_nonnegative(), - feature_size.unwrap_nonnegative(), + batch_size.int_from_positive_int(), + feature_size.int_from_positive_int(), epsilon); GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 9243601766..099536ce0d 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -38,15 +38,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -54,15 +54,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 1D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -90,15 +90,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::FLOAT, }; run_backward_test(input_shape, index_shape, output_shape); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 283b465abc..94ce8f4848 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ceca1d94dd..7691daf7a6 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { - nonnegative_int input_w = 10_n; - nonnegative_int input_h = 10_n; - nonnegative_int input_c = 3_n; - nonnegative_int input_n = 1_n; - nonnegative_int output_w = 5_n; - nonnegative_int output_h = 5_n; - nonnegative_int output_c = 3_n; - nonnegative_int output_n = 1_n; + positive_int input_w = 10_p; + positive_int input_h = 10_p; + positive_int input_c = 3_p; + positive_int input_n 
= 1_p; + positive_int output_w = 5_p; + positive_int output_h = 5_p; + positive_int output_c = 3_p; + positive_int output_n = 1_p; nonnegative_int pad_h = 0_n; nonnegative_int pad_w = 0_n; - nonnegative_int kernel_h = 2_n; - nonnegative_int kernel_w = 2_n; - nonnegative_int stride_h = 2_n; - nonnegative_int stride_w = 2_n; + positive_int kernel_h = 2_p; + positive_int kernel_w = 2_p; + positive_int stride_h = 2_p; + positive_int stride_w = 2_p; PoolOp pool_type = PoolOp::MAX; @@ -34,20 +34,20 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Pool2DPerDeviceState state = Kernels::Pool2D::init_kernel(/*handle=*/managed_handle.raw_handle(), /*activation=*/std::nullopt, - /*input_w=*/input_w.unwrap_nonnegative(), - /*input_h=*/input_h.unwrap_nonnegative(), - /*input_c=*/input_c.unwrap_nonnegative(), - /*input_n=*/input_n.unwrap_nonnegative(), - /*output_w=*/output_w.unwrap_nonnegative(), - /*output_h=*/output_h.unwrap_nonnegative(), - /*output_c=*/output_c.unwrap_nonnegative(), - /*output_n=*/output_n.unwrap_nonnegative(), + /*input_w=*/input_w.int_from_positive_int(), + /*input_h=*/input_h.int_from_positive_int(), + /*input_c=*/input_c.int_from_positive_int(), + /*input_n=*/input_n.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_n=*/output_n.int_from_positive_int(), /*pad_h=*/pad_h.unwrap_nonnegative(), /*pad_w=*/pad_w.unwrap_nonnegative(), - /*kernel_h=*/kernel_h.unwrap_nonnegative(), - /*kernel_w=*/kernel_w.unwrap_nonnegative(), - /*stride_h=*/stride_h.unwrap_nonnegative(), - /*stride_w=*/stride_w.unwrap_nonnegative(), + /*kernel_h=*/kernel_h.int_from_positive_int(), + /*kernel_w=*/kernel_w.int_from_positive_int(), + /*stride_h=*/stride_h.int_from_positive_int(), + /*stride_w=*/stride_w.int_from_positive_int(), /*pool_type=*/pool_type); TensorShape input_shape = TensorShape{ diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index b7990d84fa..16b03d34d9 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::size_t num_replicas = 5; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p, 10_p, 10_p, 10_p}}, DataType::FLOAT, }; @@ -23,7 +23,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::FLOAT, }; diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index ceb0915c03..95989776c1 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -13,11 +13,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { nonnegative_int num_replicas = 10_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::FLOAT, }; @@ -73,14 +73,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - nonnegative_int num_replicas = 2_n; + positive_int num_replicas = 2_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_n}}, + TensorDims{FFOrdered{5_p}}, DataType::FLOAT, }; TensorShape output_shape = 
TensorShape{ - TensorDims{FFOrdered{5_n, num_replicas}}, + TensorDims{FFOrdered{5_p, num_replicas}}, DataType::FLOAT, }; @@ -129,7 +129,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, input_grad_accessor_gpu, - num_replicas.unwrap_nonnegative()); + num_replicas.int_from_positive_int()); // Run CPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = @@ -140,7 +140,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Kernels::Replicate::cpu_backward_kernel( output_grad_accessor_cpu, input_grad_accessor_cpu, - num_replicas.unwrap_nonnegative()); + num_replicas.int_from_positive_int()); CHECK_MESSAGE( accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 69f0a1f214..8c851e877e 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index f2ddb2c67b..b9f97bc5cd 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -8,7 +8,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{1_n, 10_n, 10_n}}, + TensorDims{FFOrdered{1_p, 10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; @@ -55,7 +55,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{1_n, 4_n, 3_n}}, + TensorDims{FFOrdered{1_p, 4_p, 3_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 0d5dcb79a2..dc8cb276ab 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -21,7 +21,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; @@ -59,7 +59,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - output_grad_accessor.shape.num_elements().unwrap_nonnegative()); + output_grad_accessor.shape.num_elements().int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d8ddb8c4b9..d51d0e40f5 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -22,11 +22,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{50_n}}, + 
TensorDims{FFOrdered{50_p}}, DataType::FLOAT, }; diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index e2042c1e2c..06b5add3c7 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -21,7 +21,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 15ebdd5f28..974e580b8e 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -56,14 +56,14 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); int batch_size = - logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); + logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { - assert(logit.shape.get_volume() == label.shape.get_volume()); - scale_factor = 2.0f / logit.shape.get_volume().unwrap_nonnegative(); + ASSERT(logit.shape.num_elements() == label.shape.num_elements()); + scale_factor = 2.0f / logit.shape.num_elements().int_from_positive_int(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { @@ -71,27 +71,27 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); int num_classes = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); - assert(logit_grad.shape == logit.shape); + logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() / + .int_from_positive_int() / label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative(); // TODO FIXME something seems wrong here, + .int_from_positive_int(); // TODO FIXME something seems wrong here, // isn't the numerator guaranteed to be 1? 
// <--- this is not the case because of the // potential parallel dim } - assert( - label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() == + ASSERT( + label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); + ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .int_from_positive_int() == logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative()); - assert( - label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == + .int_from_positive_int()); + ASSERT( + label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, @@ -100,17 +100,17 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), batch_size, num_classes, k, scale_factor); } else { - assert(logit.shape == label.shape); - assert(logit_grad.shape == logit.shape); + ASSERT(logit.shape == label.shape); + ASSERT(logit_grad.shape == logit.shape); int num_channels = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, @@ -119,8 +119,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } @@ -131,8 +131,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } @@ -142,13 +142,13 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } default: - throw mk_runtime_error(fmt::format( + PANIC(fmt::format( "Unsupported loss function {}. 
Please report this as an issue.", loss_type)); } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1b8fc37b2d..1d65172e67 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -66,18 +66,18 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto weight = acc.get_tensor(WEIGHT); auto profiling = acc.get_argument(PROFILING); - assert(weight.shape == weight_grad.shape); - int size = weight_grad.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight.shape == weight_grad.shape); + int size = weight_grad.shape.num_elements().int_from_positive_int(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() & - weight.shape.get_volume().unwrap_nonnegative()); - int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / - weight.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight_grad.shape.num_elements().int_from_positive_int() & + weight.shape.num_elements().int_from_positive_int()); + int num_replicas = weight_grad.shape.num_elements().int_from_positive_int() / + weight.shape.num_elements().int_from_positive_int(); float *sgd_v_ptr; if (attrs.momentum > 0.0f) { auto sgd_v = acc.get_optimizer_tensor(SGD_V); - assert(sgd_v.shape == weight.shape); + ASSERT(sgd_v.shape == weight.shape); sgd_v_ptr = sgd_v.get_float_ptr(); } @@ -180,14 +180,10 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto profiling = acc.get_argument(PROFILING); - assert(weight.shape == weight_grad.shape); - int size = weight_grad.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight.shape == weight_grad.shape); + int size = weight_grad.shape.num_elements().int_from_positive_int(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() % - weight.shape.get_volume().unwrap_nonnegative() == - 0); - int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / - weight.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight_grad.shape.num_elements() % weight.shape.num_elements() == 0); auto handle = acc.get_argument(HANDLE); profile(adam_nccl_update_task_gpu, @@ -198,9 +194,9 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { attrs.beta2, attrs.weight_decay, attrs.epsilon, - size, handle, weight_grad.get_float_ptr(), + size, m_tensor.get_float_ptr(), v_tensor.get_float_ptr(), weight.get_float_ptr()); // how to deal with removal of ParamSync? 
diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 45fc8e0a1c..971b09356c 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,6 +1,6 @@ #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" @@ -29,15 +29,15 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 6dabe09799..2494ff1943 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,8 +1,10 @@ +#include "kernels/compare_tensor_accessors.h" #include "kernels/copy_tensor_accessor.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" +#include "kernels/tensor_accessor_reductions.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" @@ -20,12 +22,9 @@ bool did_loss_decrease( GenericTensorAccessorR const &first_epoch, GenericTensorAccessorR const &last_epoch ) { - for (int i = 0; i < batch_size; i++) { - if (first_epoch[i] < last_epoch[i]) { - return false; - } - } - return true; + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + return tensor_accessor_all(compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } TEST_SUITE(FF_CUDA_TEST_SUITE) { @@ -43,13 +42,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossTensorSource loss_tensor_source; loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int hidden_dim = 32_n; - nonnegative_int output_dim = 1_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; GenericTensorAccessorW label_tensor_backing = @@ -66,14 +65,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape 
weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = @@ -173,7 +172,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // the first epoch GenericTensorAccessorR first_epoch_loss = loss_values.at(0); GenericTensorAccessorR last_epoch = loss_values.back(); - CHECK(did_loss_decrease( - first_epoch_loss, last_epoch, batch_size.unwrap_nonnegative())); + CHECK(did_loss_decrease( first_epoch_loss, last_epoch)); } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 4d015f4cfa..71148d06c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,7 +9,7 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( /*workSpaceSize=*/1024 * 1024, @@ -25,8 +25,8 @@ TEST_SUITE(FF_TEST_SUITE) { LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config}; SUBCASE("Estimate cost -- Attention Op") { - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, /*num_heads=*/num_heads, @@ -38,14 +38,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }); @@ -68,7 +68,7 @@ TEST_SUITE(FF_TEST_SUITE) { make_1d_machine_view( MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, MachineSpecificationDimension::INTRA_NODE, - stride_t{0_n})); + stride_t{1_p})); CHECK(result.total_elapsed_time > 0); CHECK(result.total_mem_usage > 0); diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc deleted file mode 100644 index e55d1eddf5..0000000000 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ /dev/null @@ -1,276 +0,0 @@ -#include "kernels/attention_kernels.h" -#include "kernels/local_cpu_allocator.h" -#include "local-execution/local_cost_estimator.h" -#include "local-execution/local_slots_backing.h" -#include "op-attrs/ops/attention.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalSlotsBacking -- Attention Op") { - // allocate input memory - Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; - - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; - - DataType dtype = DataType::FLOAT; - TensorShape 
input_tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, - DataType::FLOAT, - }; - TensorShape query_shape = input_tensor_shape; - TensorShape key_shape = input_tensor_shape; - TensorShape value_shape = input_tensor_shape; - GenericTensorAccessorW query = allocator.allocate_tensor(query_shape); - GenericTensorAccessorW key = allocator.allocate_tensor(key_shape); - GenericTensorAccessorW value = allocator.allocate_tensor(value_shape); - - // build graph - ComputationGraphBuilder cg_builder; - tensor_guid_t query_guid = - cg_builder.create_input(query_shape, CreateGrad::YES); - tensor_guid_t key_guid = - cg_builder.create_input(key_shape, CreateGrad::YES); - tensor_guid_t value_guid = - cg_builder.create_input(value_shape, CreateGrad::YES); - - std::string layer_name = "attn1"; - tensor_guid_t output_guid = - cg_builder.multihead_attention(query_guid, - key_guid, - value_guid, - embed_dim, - num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0f, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - /*initializer=*/std::nullopt, - /*maybe_name=*/layer_name); - - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - - TensorBackingMap tensor_backing_map = { - {query_guid, query}, {key_guid, key}, {value_guid, value}}; - - // runtime arg config - ProfilingSettings settings = ProfilingSettings{/*warmup_iters=*/0, - /*measure_iters=*/0}; - PerDeviceFFHandle handle = get_mock_per_device_ff_handle(); - RuntimeArgConfig runtime_arg_config = - RuntimeArgConfig{DeviceSpecific::create(handle), - EnableProfiling::NO, - settings}; - - LocalSlotsBacking local_slots_backing = {tensor_backing_map, - runtime_arg_config}; - - SUBCASE("LocalSlotsBacking::allocate_outgoing_tensors") { - auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, - TensorBackingMap m) -> std::pair { - GenericTensorAccessorW accessor = m.at(t); - return get_shape_and_datatype(accessor); - }; - - SUBCASE("Input (QKV) and gradient tensors allocation") { - - // allocate all tensors from input nodes - for (layer_guid_t const &node : - topological_ordering(cg_builder.computation_graph)) { - if (node == layer_guid) { - break; - } - local_slots_backing.allocate_outgoing_tensors( - node, cg_builder.computation_graph, allocator); - } - - SUBCASE("Query grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape(query_shape), dtype}; - CHECK(result == correct); - } - SUBCASE("Key grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape(key_shape), dtype}; - CHECK(result == correct); - } - SUBCASE("Value grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape(value_shape), dtype}; - CHECK(result == correct); - } - } - SUBCASE("Output and gradient tensors allocation") { - local_slots_backing.allocate_outgoing_tensors( - layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("Output") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape( - 
get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape), - dtype}; - CHECK(result == correct); - } - SUBCASE("Output grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape( - get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape), - dtype}; - CHECK(result == correct); - } - } - - SUBCASE("Tensor slots") { - local_slots_backing.allocate_outgoing_tensors( - layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("Input tensor slots") { - std::vector correct_incoming_tensors = - get_incoming_tensors(cg_builder.computation_graph, layer_guid); - CHECK(correct_incoming_tensors == - local_slots_backing.input_tensor_slots.at(layer_guid)); - } - SUBCASE("Output tensor slots") { - std::vector correct_outgoing_tensors = - get_outgoing_tensors(cg_builder.computation_graph, layer_guid); - CHECK(correct_outgoing_tensors == - local_slots_backing.output_tensor_slots.at(layer_guid)); - } - } - } - - SUBCASE("Construct Slots Backings") { - enum Slots { - QUERY, - KEY, - VALUE, - WEIGHTS, - OUTPUT, - QUERY_PARALLEL_TENSOR_SHAPE, - QPROJSIZE, - ATTRS, - PROFILING, - HANDLE, - }; - MultiHeadAttentionAttrs attrs = - get_layer_attrs(cg_builder.computation_graph, layer_guid) - .op_attrs.get(); - OpTaskBinding binding = [&] { - OpTaskBinding b; - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(3)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_grad(QUERY, input_tensor(0)); - - b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); - b.bind_arg(ATTRS, attrs); - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(HANDLE, ff_handle()); - return b; - }(); - - // allocate all incoming and outgoing tensors for graph - for (layer_guid_t const &node : - topological_ordering(cg_builder.computation_graph)) { - local_slots_backing.allocate_outgoing_tensors( - node, cg_builder.computation_graph, allocator); - } - - SUBCASE("LocalSlotsBacking::construct_tensor_slots_backing") { - TensorSlotsBackingWithoutAddresses result = - get_slots_backing_without_tensor_allocation_addresses( - local_slots_backing.construct_tensor_slots_backing(binding, - layer_guid)); - TensorSlotsBackingWithoutAddresses correct = [&] { - TensorShape weights_shape = throw_if_unexpected( - get_weights_shape(attrs, query_shape, key_shape, value_shape)); - GenericTensorAccessorW weights = - allocator.allocate_tensor(weights_shape); - - TensorAttrs output_attrs = - get_tensor_attrs(cg_builder.computation_graph, output_guid); - GenericTensorAccessorW output = - allocator.allocate_tensor(output_attrs.shape); - return get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking{ - {SlotGradId{slot_id_t{QUERY}, IsGrad::NO}, query}, - {SlotGradId{slot_id_t{KEY}, IsGrad::NO}, key}, - {SlotGradId{slot_id_t{VALUE}, IsGrad::NO}, value}, - {SlotGradId{slot_id_t{WEIGHTS}, IsGrad::NO}, weights}, - {SlotGradId{slot_id_t{OUTPUT}, IsGrad::NO}, output}, - {SlotGradId{slot_id_t{QUERY}, IsGrad::YES}, query}}); - }(); - - CHECK(result == correct); - } - SUBCASE("LocalSlotsBacking::construct_arg_slots_backing") { - ArgSlotsBacking result = - local_slots_backing.construct_arg_slots_backing(binding, - layer_guid); - - ArgSlotsBacking correct = [&] { - ParallelTensorShape query_parallel_tensor_shape = - lift_to_parallel(query_shape); - - 
return ArgSlotsBacking{ - {slot_id_t{QPROJSIZE}, - ConcreteArgSpec::create(get_qProjSize(attrs))}, - {slot_id_t{ATTRS}, ConcreteArgSpec::create(attrs)}, - {slot_id_t{QUERY_PARALLEL_TENSOR_SHAPE}, - ConcreteArgSpec::create(query_parallel_tensor_shape)}, - {slot_id_t{PROFILING}, - ConcreteArgSpec::create(runtime_arg_config.profiling_settings)}, - {slot_id_t{HANDLE}, ConcreteArgSpec::create(handle)}}; - }(); - - CHECK(result == correct); - } - - SUBCASE("LocalSlotsBacking::resolve_runtime_arg_ref_spec") { - RuntimeArgRefSpec ref_spec = RuntimeArgRefSpec::create(ff_handle()); - ConcreteArgSpec arg_spec = - local_slots_backing.resolve_runtime_arg_ref_spec(ref_spec); - - PerDeviceFFHandle result_handle = arg_spec.get(); - CHECK(result_handle == handle); - } - } - } -} diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 29b3b432cd..e817b6fd8e 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -9,17 +9,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalTaskArgumentAccessor") { Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index 594051c2f1..df787fcd6f 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -1,4 +1,4 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_tensor_backing.h" #include "test_utils.h" #include "utils/containers/keys.h" @@ -94,11 +94,11 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_source.new_mock_tensor_guid(); TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::YES}; diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index e8f48413b6..5a9347e37b 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -13,7 +13,7 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; @@ -31,15 +31,15 @@ TEST_SUITE(FF_TEST_SUITE) { loss_tensor_t label_for_sparse_cce_loss_attrs = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 
16_p; + positive_int output_dim = 32_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = @@ -58,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index c87fd3a899..ea20eb0fa0 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -12,8 +12,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; ComputationGraphOpAttrs attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -80,7 +80,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("different attrs, still same task fn mapping") { layer_guid_t layer_1 = layer_guid_t{Node{1}}; - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; layer_guid_t layer_2 = layer_guid_t{Node{2}}; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("equality") { SUBCASE("different attrs is still equal") { - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 82f5a132fe..7a2650b447 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,6 +1,6 @@ #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.h" @@ -38,15 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source.new_optimizer_tensor(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; diff --git a/lib/local-execution/test/src/test_update.cc 
b/lib/local-execution/test/src/test_update.cc index 18509d1fd9..6ffe002f22 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -11,7 +11,7 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; @@ -26,16 +26,16 @@ TEST_SUITE(FF_TEST_SUITE) { // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int output_dim = 32_p; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = diff --git a/lib/models/include/models/bert/bert_config.struct.toml b/lib/models/include/models/bert/bert_config.struct.toml index cc2a8eb0a7..de56a25710 100644 --- a/lib/models/include/models/bert/bert_config.struct.toml +++ b/lib/models/include/models/bert/bert_config.struct.toml @@ -12,28 +12,28 @@ features = [ includes = [ "op-attrs/activation.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "vocab_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "hidden_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_encoder_layers" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_heads" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dim_feedforward" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "hidden_act" @@ -65,8 +65,8 @@ type = "float" [[fields]] name = "sequence_length" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml index e7d83efd07..135c58e1cc 100644 --- a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml +++ b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml @@ -14,7 +14,7 @@ includes = [ "", "", "", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -26,19 +26,19 @@ src_includes = [ [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dense_layers" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "dense_feature_layers" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "feature_shapes" -type = "std::map" +type = "std::map" [[fields]] name = "input_features" diff --git a/lib/models/include/models/dlrm/dlrm_config.struct.toml b/lib/models/include/models/dlrm/dlrm_config.struct.toml index 5f1c38faae..3cf43aed48 100644 --- 
a/lib/models/include/models/dlrm/dlrm_config.struct.toml +++ b/lib/models/include/models/dlrm/dlrm_config.struct.toml @@ -14,7 +14,7 @@ includes = [ "", "", "models/dlrm/dlrm_arch_interaction_op.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -24,23 +24,23 @@ src_includes = [ [[fields]] name = "embedding_dim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "embedding_bag_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "embedding_size" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "dense_arch_layer_sizes" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "over_arch_layer_sizes" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "arch_interaction_op" @@ -48,7 +48,7 @@ type = "::FlexFlow::DLRMArchInteractionOp" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "seed" diff --git a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml index 1290420e16..0075783c87 100644 --- a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml +++ b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml @@ -11,16 +11,16 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_classes" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "aux_logits" diff --git a/lib/models/include/models/split_test/split_test.h b/lib/models/include/models/split_test/split_test.h index dd7089c4f6..d5de538b8b 100644 --- a/lib/models/include/models/split_test/split_test.h +++ b/lib/models/include/models/split_test/split_test.h @@ -12,7 +12,7 @@ namespace FlexFlow { * @note This is a tiny model developed for testing the original Unity * implementation. It is not a "real" model and has never been trained. 
*/ -ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size); +ComputationGraph get_split_test_computation_graph(positive_int batch_size); } // namespace FlexFlow diff --git a/lib/models/include/models/transformer/transformer_config.struct.toml b/lib/models/include/models/transformer/transformer_config.struct.toml index 2a0b39feb9..686491eff4 100644 --- a/lib/models/include/models/transformer/transformer_config.struct.toml +++ b/lib/models/include/models/transformer/transformer_config.struct.toml @@ -10,36 +10,36 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_features" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "sequence_length" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dim_feedforward" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_heads" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_encoder_layers" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_decoder_layers" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dropout" @@ -51,4 +51,4 @@ type = "float" [[fields]] name = "vocab_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index 535e03e413..bfcab8ffbf 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -7,11 +7,11 @@ namespace FlexFlow { BertConfig get_default_bert_config() { return BertConfig{ - /*vocab_size=*/30522_n, - /*hidden_size=*/768_n, - /*num_encoder_layers=*/12_n, - /*num_heads=*/12_n, - /*dim_feedforward=*/3072_n, + /*vocab_size=*/30522_p, + /*hidden_size=*/768_p, + /*num_encoder_layers=*/12_p, + /*num_heads=*/12_p, + /*dim_feedforward=*/3072_p, /*hidden_act=*/Activation::GELU, /*hidden_dropout_prob=*/0.1, /*attention_probs_dropout_prob=*/0.1, @@ -19,8 +19,8 @@ BertConfig get_default_bert_config() { /*layer_norm_eps=*/1e-12, /*position_embedding_type=*/"absolute", /*classifier_dropout=*/0.1, - /*sequence_length=*/512_n, - /*batch_size=*/64_n, + /*sequence_length=*/512_p, + /*batch_size=*/64_p, }; } @@ -60,8 +60,8 @@ tensor_guid_t assert(num_dims(cgb.get_shape(input)) == 3); std::vector layer_norm_axis = { relative_ff_dim_t{-1}}; // Apply layernorm across the last dim - nonnegative_int kdim = config.dim_feedforward / config.num_heads; - nonnegative_int vdim = config.dim_feedforward / config.num_heads; + positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; + positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; tensor_guid_t self_attention = cgb.multihead_attention(input, input, @@ -130,7 +130,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { InitializerAttrs bias_initializer = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.hidden_size}}, DataType::FLOAT, }; @@ -152,7 +152,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { assert( (cgb.get_shape(out_prob) == TensorShape{ - TensorDims{FFOrdered{ + 
TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.vocab_size}}, DataType::FLOAT, })); diff --git a/lib/models/src/models/candle_uno/candle_uno.cc b/lib/models/src/models/candle_uno/candle_uno.cc index 3d06b03348..8bbbccdbaf 100644 --- a/lib/models/src/models/candle_uno/candle_uno.cc +++ b/lib/models/src/models/candle_uno/candle_uno.cc @@ -6,16 +6,16 @@ namespace FlexFlow { CandleUnoConfig get_default_candle_uno_config() { return CandleUnoConfig{ - /*batch_size=*/64_n, - /*dense_layers=*/repeat_element(/*num_times=*/4_n, /*element=*/4192_n), + /*batch_size=*/64_p, + /*dense_layers=*/repeat_element(/*num_times=*/4_n, /*element=*/4192_p), /*dense_feature_layers=*/ - repeat_element(/*num_times=*/8_n, /*element=*/4192_n), + repeat_element(/*num_times=*/8_n, /*element=*/4192_p), /*feature_shapes=*/ { - {"dose", 1_n}, - {"cell.rnaseq", 942_n}, - {"drug.descriptors", 5270_n}, - {"drug.fingerprints", 2048_n}, + {"dose", 1_p}, + {"cell.rnaseq", 942_p}, + {"drug.descriptors", 5270_p}, + {"drug.fingerprints", 2048_p}, }, /*input_features=*/ { @@ -37,7 +37,7 @@ tensor_guid_t create_candle_uno_feature_model( tensor_guid_t const &input, InitializerAttrs const &kernel_initializer) { tensor_guid_t t = input; - for (nonnegative_int dense_dim : config.dense_feature_layers) { + for (positive_int dense_dim : config.dense_feature_layers) { t = cgb.dense(t, dense_dim, Activation::RELU, @@ -58,7 +58,7 @@ ComputationGraph InitializerAttrs{GlorotNormalAttrs{/*seed=*/0}}; auto create_input_tensor = - [&](FFOrdered const &dims) -> tensor_guid_t { + [&](FFOrdered const &dims) -> tensor_guid_t { TensorShape input_shape = TensorShape{ TensorDims{dims}, DataType::FLOAT, @@ -84,7 +84,7 @@ ComputationGraph for (auto const &input_feature : config.input_features) { std::string const &feature_name = input_feature.second; - nonnegative_int shape = config.feature_shapes.at(feature_name); + positive_int shape = config.feature_shapes.at(feature_name); tensor_guid_t input = create_input_tensor({config.batch_size, shape}); all_inputs.push_back(input); @@ -98,7 +98,7 @@ ComputationGraph tensor_guid_t output = cgb.concat(encoded_inputs, /*axis=*/relative_ff_dim_t{1}); - for (nonnegative_int dense_layer_dim : config.dense_layers) { + for (positive_int dense_layer_dim : config.dense_layers) { tensor_guid_t residual_input = output; output = cgb.dense(output, dense_layer_dim, @@ -114,7 +114,7 @@ ComputationGraph } } output = cgb.dense(output, - /*outDim=*/1_n, + /*outDim=*/1_p, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, diff --git a/lib/models/src/models/dlrm/dlrm.cc b/lib/models/src/models/dlrm/dlrm.cc index 718e709352..5d56909fec 100644 --- a/lib/models/src/models/dlrm/dlrm.cc +++ b/lib/models/src/models/dlrm/dlrm.cc @@ -10,37 +10,37 @@ namespace FlexFlow { DLRMConfig get_default_dlrm_config() { return DLRMConfig{ - /*embedding_dim=*/64_n, - /*embedding_bag_size=*/1_n, + /*embedding_dim=*/64_p, + /*embedding_bag_size=*/1_p, /*embedding_size=*/ - std::vector{ - 1000000_n, - 1000000_n, - 1000000_n, - 1000000_n, + std::vector{ + 1000000_p, + 1000000_p, + 1000000_p, + 1000000_p, }, /*dense_arch_layer_sizes=*/ - std::vector{ - 4_n, - 64_n, - 64_n, + std::vector{ + 4_p, + 64_p, + 64_p, }, /*over_arch_layer_sizes=*/ - std::vector{ - 64_n, - 64_n, - 2_n, + std::vector{ + 64_p, + 64_p, + 2_p, }, /*arch_interaction_op=*/DLRMArchInteractionOp::CAT, - /*batch_size=*/64_n, - /*seed=*/std::rand(), + /*batch_size=*/64_p, + /*seed=*/0, }; } tensor_guid_t create_dlrm_mlp(ComputationGraphBuilder 
&cgb, DLRMConfig const &config, tensor_guid_t const &input, - std::vector const &mlp_layers) { + std::vector const &mlp_layers) { tensor_guid_t t = input; // Refer to @@ -76,8 +76,8 @@ tensor_guid_t create_dlrm_mlp(ComputationGraphBuilder &cgb, tensor_guid_t create_dlrm_sparse_embedding_network(ComputationGraphBuilder &cgb, DLRMConfig const &config, tensor_guid_t const &input, - nonnegative_int input_dim, - nonnegative_int output_dim) { + positive_int input_dim, + positive_int output_dim) { float range = sqrt(1.0f / input_dim); InitializerAttrs embed_initializer = InitializerAttrs{UniformInitializerAttrs{ /*seed=*/config.seed, @@ -116,7 +116,7 @@ tensor_guid_t create_dlrm_interact_features( ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { ComputationGraphBuilder cgb; - auto create_input_tensor = [&](FFOrdered const &dims, + auto create_input_tensor = [&](FFOrdered const &dims, DataType const &data_type) -> tensor_guid_t { TensorShape input_shape = TensorShape{ TensorDims{dims}, @@ -145,7 +145,7 @@ ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { std::vector emb_outputs = transform( zip(config.embedding_size, sparse_inputs), - [&](std::pair const &combined_pair) + [&](std::pair const &combined_pair) -> tensor_guid_t { return create_dlrm_sparse_embedding_network( /*cgb=*/cgb, diff --git a/lib/models/src/models/inception_v3/inception_v3.cc b/lib/models/src/models/inception_v3/inception_v3.cc index 3a829f3754..82aa445f17 100644 --- a/lib/models/src/models/inception_v3/inception_v3.cc +++ b/lib/models/src/models/inception_v3/inception_v3.cc @@ -16,12 +16,12 @@ struct CheckShape { InceptionV3Config const &config; void operator()(tensor_guid_t t, - nonnegative_int c, - nonnegative_int h, - nonnegative_int w) const { + positive_int c, + positive_int h, + positive_int w) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, c, h, @@ -38,10 +38,10 @@ struct CheckShape { } } - void operator()(tensor_guid_t t, nonnegative_int c) const { + void operator()(tensor_guid_t t, positive_int c) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, c, }}, @@ -59,11 +59,11 @@ struct CheckShape { InceptionV3Config get_default_inception_v3_training_config() { return InceptionV3Config{ - /*num_classes=*/1000_n, + /*num_classes=*/1000_p, // see section 8 of https://arxiv.org/abs/1512.00567 for the source of the // batch size - /*batch_size=*/32_n, + /*batch_size=*/32_p, // see section 4 of https://arxiv.org/abs/1512.00567 for a discussion of // auxiliary logits. 
they are used by default in training @@ -73,11 +73,11 @@ InceptionV3Config get_default_inception_v3_training_config() { static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - nonnegative_int filters, - nonnegative_int kernel_size_h, - nonnegative_int kernel_size_w, - nonnegative_int stride_h = 1_n, - nonnegative_int stride_w = 1_n, + positive_int filters, + positive_int kernel_size_h, + positive_int kernel_size_w, + positive_int stride_h = 1_p, + positive_int stride_w = 1_p, nonnegative_int padding_h = 0_n, nonnegative_int padding_w = 0_n, bool use_bias = false) { @@ -90,7 +90,7 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, /*paddingH=*/padding_h, /*paddingW=*/padding_w, /*activation=*/std::nullopt, - /*groups=*/1_n, + /*groups=*/1_p, /*use_bias=*/use_bias); return cgb.batch_norm(conv, /*affine=*/true, @@ -101,27 +101,27 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - nonnegative_int pool_features) { + positive_int pool_features) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/64_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/64_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); tensor_guid_t branch5x5 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/48_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/48_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/5_n, - /*kernel_size_w=*/5_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/64_p, + /*kernel_size_h=*/5_p, + /*kernel_size_w=*/5_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/2_n, /*padding_w=*/2_n); return t; @@ -131,25 +131,25 @@ static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/64_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/96_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/96_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); return t; @@ -158,18 +158,18 @@ static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/1_n, /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, /*filters=*/pool_features, - /*kernel_stride_h=*/1_n, - /*kernel_stride_w=*/1_n); + /*kernel_stride_h=*/1_p, + /*kernel_stride_w=*/1_p); return t; }(); @@ -181,43 +181,43 @@ static tensor_guid_t create_inception_module_b(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch3x3 = create_conv_block(cgb, input, - /*filters=*/384_n, - 
/*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); tensor_guid_t branch3x3dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/64_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/96_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_stride_h=*/3_n, - /*kernel_stride_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); + /*filters=*/96_p, + /*kernel_stride_h=*/3_p, + /*kernel_stride_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); @@ -229,108 +229,108 @@ static tensor_guid_t create_inception_module_b(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_module_c(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - nonnegative_int channels_7x7) { + positive_int channels_7x7) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - check_shape(branch1x1, 192_n, 17_n, 17_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + check_shape(branch1x1, 192_p, 17_p, 17_p); tensor_guid_t branch7x7 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); return t; }(); - check_shape(branch7x7, 192_n, 17_n, 17_n); + check_shape(branch7x7, 192_p, 17_p, 17_p); tensor_guid_t branch7x7dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); t = create_conv_block(cgb, t, 
/*filters=*/channels_7x7, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); return t; }(); - check_shape(branch7x7dbl, 192_n, 17_n, 17_n); + check_shape(branch7x7dbl, 192_p, 17_p, 17_p); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/1_n, /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); return t; }(); - check_shape(branch_pool, 192_n, 17_n, 17_n); + check_shape(branch_pool, 192_p, 17_p, 17_p); return cgb.concat({branch1x1, branch7x7, branch7x7dbl, branch_pool}, /*axis=*/relative_ff_dim_t{1}); @@ -342,10 +342,10 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - t = create_conv_block(cgb, t, 320_n, 3_n, 3_n, 2_n, 2_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + t = create_conv_block(cgb, t, 320_p, 3_p, 3_p, 2_p, 2_p); return t; }(); @@ -353,42 +353,42 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); + /*filters=*/192_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); @@ -401,33 +401,33 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/320_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/320_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); tensor_guid_t branch3x3 = [&] { tensor_guid_t t = input; t = 
create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/384_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/0_n); t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); @@ -438,34 +438,34 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/448_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/448_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/0_n); t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); @@ -475,18 +475,18 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/1_n, /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); return t; }(); @@ -499,75 +499,75 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t t = input; - check_shape(t, 3_n, 299_n, 299_n); + check_shape(t, 3_p, 299_p, 299_p); // Conv2d_1a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); - check_shape(t, 32_n, 149_n, 149_n); + /*filters=*/32_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); + check_shape(t, 32_p, 149_p, 149_p); // Conv2d_2a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n); - check_shape(t, 32_n, 147_n, 147_n); + /*filters=*/32_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p); + check_shape(t, 32_p, 147_p, 147_p); // Conv2d_2b_3x3 t = 
create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/64_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); - check_shape(t, 64_n, 147_n, 147_n); + check_shape(t, 64_p, 147_p, 147_p); // maxpool1 t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 64_n, 73_n, 73_n); + check_shape(t, 64_p, 73_p, 73_p); // Conv2d_3b_1x1 t = create_conv_block(cgb, t, - /*filters=*/80_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - check_shape(t, 80_n, 73_n, 73_n); + /*filters=*/80_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + check_shape(t, 80_p, 73_p, 73_p); // Conv2d_4a_3x3 t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n); - check_shape(t, 192_n, 71_n, 71_n); + /*filters=*/192_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p); + check_shape(t, 192_p, 71_p, 71_p); // maxpool2 t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 192_n, 35_n, 35_n); + check_shape(t, 192_p, 35_p, 35_p); return t; } @@ -575,26 +575,26 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - nonnegative_int num_classes) { + positive_int num_classes) { // avgpool tensor_guid_t x = cgb.pool2d(input, - /*kernelH=*/8_n, - /*kernelW=*/8_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/8_p, + /*kernelW=*/8_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 2048_n, 1_n, 1_n); + check_shape(x, 2048_p, 1_p, 1_p); // dropout x = cgb.dropout(x, /*rate=*/0.5); - check_shape(x, 2048_n, 1_n, 1_n); + check_shape(x, 2048_p, 1_p, 1_p); x = cgb.flat(x, /*start_dim=*/relative_ff_dim_t{1}); - check_shape(x, 2048_n); + check_shape(x, 2048_p); // fc x = cgb.dense(x, @@ -602,7 +602,7 @@ static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, check_shape(x, num_classes); // softmax (not in pytorch model, but shown in Table 1 on p6 of - // https://arxiv.org/abs/1512.00567_n) + // https://arxiv.org/abs/1512.00567) x = cgb.softmax(x); check_shape(x, num_classes); @@ -612,44 +612,44 @@ static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_aux(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - nonnegative_int num_classes) { + positive_int num_classes) { tensor_guid_t x = input; - check_shape(x, 768_n, 17_n, 17_n); + check_shape(x, 768_p, 17_p, 17_p); x = cgb.pool2d(x, - /*kernelH=*/5_n, - /*kernelW=*/5_n, - /*strideH=*/3_n, - /*strideW=*/3_n, + /*kernelH=*/5_p, + /*kernelW=*/5_p, + /*strideH=*/3_p, + /*strideW=*/3_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 768_n, 5_n, 5_n); + check_shape(x, 768_p, 5_p, 5_p); // conv0 x = create_conv_block(cgb, x, - /*filters=*/128_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - check_shape(x, 128_n, 5_n, 5_n); + /*filters=*/128_p, 
+ /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + check_shape(x, 128_p, 5_p, 5_p); // conv1 x = create_conv_block(cgb, x, - /*filters=*/768_n, - /*kernel_size_h=*/5_n, - /*kernel_size_w=*/5_n); - check_shape(x, 768_n, 1_n, 1_n); + /*filters=*/768_p, + /*kernel_size_h=*/5_p, + /*kernel_size_w=*/5_p); + check_shape(x, 768_p, 1_p, 1_p); x = cgb.adaptive_pool2d(x, - /*output_h=*/1_n, - /*output_w=*/1_n); - check_shape(x, 768_n, 1_n, 1_n); + /*output_h=*/1_p, + /*output_w=*/1_p); + check_shape(x, 768_p, 1_p, 1_p); x = cgb.flat(x, /*start_dim=*/relative_ff_dim_t{1}); - check_shape(x, 768_n); + check_shape(x, 768_p); // fc x = cgb.dense(x, @@ -671,39 +671,39 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, }; tensor_guid_t x = create_initial_layers(cgb, check_shape, input); - check_shape(x, 192_n, 35_n, 35_n); + check_shape(x, 192_p, 35_p, 35_p); // Mixed_5b - x = create_inception_module_a(cgb, x, 32_n); - check_shape(x, 256_n, 35_n, 35_n); + x = create_inception_module_a(cgb, x, 32_p); + check_shape(x, 256_p, 35_p, 35_p); // Mixed_5c - x = create_inception_module_a(cgb, x, 64_n); - check_shape(x, 288_n, 35_n, 35_n); + x = create_inception_module_a(cgb, x, 64_p); + check_shape(x, 288_p, 35_p, 35_p); // Mixed_5d - x = create_inception_module_a(cgb, x, 64_n); - check_shape(x, 288_n, 35_n, 35_n); + x = create_inception_module_a(cgb, x, 64_p); + check_shape(x, 288_p, 35_p, 35_p); // Mixed_6a x = create_inception_module_b(cgb, x); - check_shape(x, 768_n, 17_n, 17_n); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6b - x = create_inception_module_c(cgb, check_shape, x, 128_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 128_p); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6c - x = create_inception_module_c(cgb, check_shape, x, 160_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 160_p); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6d - x = create_inception_module_c(cgb, check_shape, x, 160_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 160_p); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6e - x = create_inception_module_c(cgb, check_shape, x, 192_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 192_p); + check_shape(x, 768_p, 17_p, 17_p); std::optional aux; if (config.aux_logits) { @@ -713,15 +713,15 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, // Mixed_7a x = create_inception_module_d(cgb, x); - check_shape(x, 1280_n, 8_n, 8_n); + check_shape(x, 1280_p, 8_p, 8_p); // Mixed_7b x = create_inception_module_e(cgb, x); - check_shape(x, 2048_n, 8_n, 8_n); + check_shape(x, 2048_p, 8_p, 8_p); // Mixed_7c x = create_inception_module_e(cgb, x); - check_shape(x, 2048_n, 8_n, 8_n); + check_shape(x, 2048_p, 8_p, 8_p); x = create_final_layers(cgb, check_shape, x, config.num_classes); check_shape(x, config.num_classes); @@ -737,11 +737,11 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, - 3_n, - 299_n, - 299_n, + 3_p, + 299_p, + 299_p, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc index d3876d8bfc..67d2f74ce0 100644 --- a/lib/models/src/models/split_test/split_test.cc +++ b/lib/models/src/models/split_test/split_test.cc @@ -4,16 +4,16 @@ namespace FlexFlow { -ComputationGraph 
get_split_test_computation_graph(nonnegative_int batch_size) {
+ComputationGraph get_split_test_computation_graph(positive_int batch_size) {
   ComputationGraphBuilder cgb;
-  nonnegative_int layer_dim1 = 256_n;
-  nonnegative_int layer_dim2 = 128_n;
-  nonnegative_int layer_dim3 = 64_n;
-  nonnegative_int layer_dim4 = 32_n;
+  positive_int layer_dim1 = 256_p;
+  positive_int layer_dim2 = 128_p;
+  positive_int layer_dim3 = 64_p;
+  positive_int layer_dim4 = 32_p;
   TensorShape input_shape = TensorShape{
-      TensorDims{FFOrdered<nonnegative_int>{
+      TensorDims{FFOrdered<positive_int>{
           batch_size,
           layer_dim1,
       }},
diff --git a/lib/models/src/models/transformer/transformer.cc b/lib/models/src/models/transformer/transformer.cc
index f71763313a..dfc40a5720 100644
--- a/lib/models/src/models/transformer/transformer.cc
+++ b/lib/models/src/models/transformer/transformer.cc
@@ -4,16 +4,16 @@
 namespace FlexFlow {
 TransformerConfig get_default_transformer_config() {
-  return TransformerConfig{/*num_features=*/512_n,
-                           /*sequence_length=*/512_n,
-                           /*batch_size=*/64_n,
-                           /*dim_feedforward=*/2048_n,
-                           /*num_heads=*/8_n,
-                           /*num_encoder_layers=*/6_n,
-                           /*num_decoder_layers=*/6_n,
+  return TransformerConfig{/*num_features=*/512_p,
+                           /*sequence_length=*/512_p,
+                           /*batch_size=*/64_p,
+                           /*dim_feedforward=*/2048_p,
+                           /*num_heads=*/8_p,
+                           /*num_encoder_layers=*/6_p,
+                           /*num_decoder_layers=*/6_p,
                            /*dropout=*/0.1,
                            /*layer_norm_eps=*/1e-05,
-                           /*vocab_size=*/64_n};
+                           /*vocab_size=*/64_p};
 }
 tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb,
@@ -34,8 +34,8 @@ tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb,
                                                tensor_guid_t const &input) {
   std::vector<relative_ff_dim_t> layer_norm_axis = {
       relative_ff_dim_t{-1}}; // Normalize the last dim
-  nonnegative_int kdim = config.dim_feedforward / config.num_heads;
-  nonnegative_int vdim = config.dim_feedforward / config.num_heads;
+  positive_int kdim = positive_int{config.dim_feedforward / config.num_heads};
+  positive_int vdim = positive_int{config.dim_feedforward / config.num_heads};
   tensor_guid_t self_attention =
       cgb.multihead_attention(/*query=*/input,
                               /*key=*/input,
@@ -83,8 +83,8 @@ tensor_guid_t
                                tensor_guid_t const &encoder_output) {
   std::vector<relative_ff_dim_t> layer_norm_axis = {
       relative_ff_dim_t{-1}}; // Normalize the last dim
-  nonnegative_int kdim = config.dim_feedforward / config.num_heads;
-  nonnegative_int vdim = config.dim_feedforward / config.num_heads;
+  positive_int kdim = positive_int{config.dim_feedforward / config.num_heads};
+  positive_int vdim = positive_int{config.dim_feedforward / config.num_heads};
   tensor_guid_t self_attention =
       cgb.multihead_attention(/*query=*/input,
                               /*key=*/input,
@@ -153,7 +153,7 @@ ComputationGraph
   ComputationGraphBuilder cgb;
   TensorShape input_shape = TensorShape{
-      TensorDims{FFOrdered<nonnegative_int>{
+      TensorDims{FFOrdered<positive_int>{
          config.batch_size, config.sequence_length, config.num_features}},
       DataType::FLOAT,
   };
diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h
index 9996e36482..62f7ccd4f9 100644
--- a/lib/op-attrs/include/op-attrs/datatype.h
+++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -4,7 +4,7 @@
 #include "op-attrs/datatype.dtg.h"
 #include "utils/fmt.h"
 #include "utils/fp16.h"
-#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/positive_int/positive_int.h"
 #include 
 namespace FlexFlow {
@@ -40,7 +40,7 @@
 template <typename T>
 struct type_to_data_type_enum;
 template <>
-struct type_to_data_type_enum<nonnegative_int>
+struct type_to_data_type_enum<positive_int>
     : std::integral_constant {};
 template <>
@@ -74,7 +74,7 @@ typename data_type_enum_to_class<DT>::type cast_to(T t) {
 template <DataType DT>
 using real_type_t = typename data_type_enum_to_class<DT>
::type; -nonnegative_int size_of_datatype(DataType); +positive_int size_of_datatype(DataType); /** * @brief Maximally semantics-preserving casts, not including identity diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h index 723e69bddd..b646692de9 100644 --- a/lib/op-attrs/include/op-attrs/datatype_value.h +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#include "op-attrs/datatype.dtg.h" #include "op-attrs/datatype_value.dtg.h" namespace FlexFlow { @@ -11,6 +12,8 @@ DataTypeValue make_int32_data_type_value(int32_t value); DataTypeValue make_int64_data_type_value(int64_t value); DataTypeValue make_bool_data_type_value(bool value); +DataType get_data_type_of_data_type_value(DataTypeValue); + } // namespace FlexFlow #endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h index bd95ff677c..f619f94e20 100644 --- a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h +++ b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h @@ -3,7 +3,7 @@ #include "op-attrs/initializers/kaiming_initializer_mode.dtg.h" #include "op-attrs/tensor_dims.dtg.h" -#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" namespace FlexFlow { @@ -13,7 +13,7 @@ namespace FlexFlow { * see * https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 */ -nonnegative_int calculate_fan_for_mode(TensorDims const &dims, +positive_int calculate_fan_for_mode(TensorDims const &dims, KaimingInitializerMode mode); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h index fa57a717e2..5ca237561f 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention.h +++ b/lib/op-attrs/include/op-attrs/ops/attention.h @@ -13,31 +13,31 @@ namespace FlexFlow { -nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_vProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &); +positive_int get_qProjSize(MultiHeadAttentionAttrs const &); +positive_int get_vProjSize(MultiHeadAttentionAttrs const &); +positive_int get_kProjSize(MultiHeadAttentionAttrs const &); +positive_int get_oProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_qSize(MultiHeadAttentionInputs const &); +positive_int get_qSize(MultiHeadAttentionParallelInputs const &); +positive_int get_qSize(MultiHeadAttentionInputs const &); -nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_kSize(MultiHeadAttentionInputs const &); +positive_int get_kSize(MultiHeadAttentionParallelInputs const &); +positive_int get_kSize(MultiHeadAttentionInputs const &); -nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_vSize(MultiHeadAttentionInputs const &); +positive_int get_vSize(MultiHeadAttentionParallelInputs const &); +positive_int get_vSize(MultiHeadAttentionInputs const &); -nonnegative_int get_oSize(ParallelTensorShape const &); 
-nonnegative_int get_oSize(TensorShape const &); +positive_int get_oSize(ParallelTensorShape const &); +positive_int get_oSize(TensorShape const &); -nonnegative_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &); +positive_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); +positive_int get_qoSeqLength(MultiHeadAttentionInputs const &); -nonnegative_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &); +positive_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); +positive_int get_kvSeqLength(MultiHeadAttentionInputs const &); -nonnegative_int get_num_samples(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_num_samples(MultiHeadAttentionInputs const &); +positive_int get_num_samples(MultiHeadAttentionParallelInputs const &); +positive_int get_num_samples(MultiHeadAttentionInputs const &); std::vector get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs const &); diff --git a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml index f85b7268af..8b9aefb67e 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml @@ -11,28 +11,28 @@ features = [ includes = [ "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "sequence_length" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "query_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "key_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "value_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml index 019131b07c..b9c6847cd6 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml @@ -11,24 +11,24 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "embed_dim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_heads" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "kdim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "vdim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dropout" diff --git a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml index b3c574264c..d80f853b00 100644 --- a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ 
-21,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "combine_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml index c4fb74ebd8..b81acbfadd 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml @@ -12,24 +12,24 @@ features = [ includes = [ "", "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_samples" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_channels" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "height" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "width" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml index fdf0eaca78..668c61168b 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ -33,11 +33,11 @@ type = "::FlexFlow::ShardParallelDim" [[fields]] name = "sum_reduction_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "discard_copy_reduction_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml index 8b86d42e04..469ce6570e 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml @@ -13,6 +13,7 @@ includes = [ "", "op-attrs/activation.dtg.h", "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -22,14 +23,14 @@ src_includes = [ ] fields = [ - { name = "out_channels", type = "::FlexFlow::nonnegative_int" }, - { name = "kernel_h", type = "::FlexFlow::nonnegative_int" }, - { name = "kernel_w", type = "::FlexFlow::nonnegative_int" }, - { name = "stride_h", type = "::FlexFlow::nonnegative_int" }, - { name = "stride_w", type = "::FlexFlow::nonnegative_int" }, + { name = "out_channels", type = "::FlexFlow::positive_int" }, + { name = "kernel_h", type = "::FlexFlow::positive_int" }, + { name = "kernel_w", type = "::FlexFlow::positive_int" }, + { name = "stride_h", type = "::FlexFlow::positive_int" }, + { name = "stride_w", type = "::FlexFlow::positive_int" }, { name = "padding_h", type = "::FlexFlow::nonnegative_int" }, { name = "padding_w", type = "::FlexFlow::nonnegative_int" }, - { name = "groups", type = "::FlexFlow::nonnegative_int" }, + { name = "groups", type = "::FlexFlow::positive_int" }, { name = "activation", type = "std::optional<::FlexFlow::Activation>" }, { name = "use_bias", type = "bool" }, ] diff --git 
a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml index 5a857efb3e..07f82883db 100644 --- a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/aggregate_op.dtg.h", "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", "", ] @@ -24,11 +24,11 @@ src_includes = [ [[fields]] name = "num_entries" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "out_channels" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "aggr" diff --git a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml index ffbe93c975..23513482d3 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml @@ -14,7 +14,7 @@ includes = [ "op-attrs/activation.dtg.h", "op-attrs/regularizer_attrs.dtg.h", "", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -25,7 +25,7 @@ src_includes = [ [[fields]] name = "out_channels" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "use_bias" diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d.h b/lib/op-attrs/include/op-attrs/ops/pool_2d.h index af11d61f07..368250c957 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d.h +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d.h @@ -13,8 +13,8 @@ CHECK_VALID_OP_ATTR(Pool2DAttrs); tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp pool_type, std::optional const &activation); diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml index fea318d46d..d0005eee19 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml @@ -14,6 +14,7 @@ includes = [ "op-attrs/activation.dtg.h", "", "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -24,19 +25,19 @@ src_includes = [ [[fields]] name = "kernel_h" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "kernel_w" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "stride_h" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "stride_w" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "padding_h" diff --git a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml index 2798a85caf..1ae2dcdc75 100644 --- a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "reduction_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml 
b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml index 965c40c05a..9f08a13fcf 100644 --- a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ -21,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "repartition_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml index 58e365c0f2..739f0edfb4 100644 --- a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "replicate_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml index 1c5bfc8e10..8feaff4dc0 100644 --- a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml @@ -10,12 +10,12 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "k" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "sorted" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index d68ef02ec1..e25627f709 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -13,7 +13,7 @@ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", "op-attrs/ff_ordered/ff_ordered.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ -26,4 +26,4 @@ type = "::FlexFlow::DiscardCopyDegree" [[fields]] name = "shard_degrees" -type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" +type = "::FlexFlow::FFOrdered<::FlexFlow::positive_int>" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index 67864e637b..bb374d98ee 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -9,7 +9,7 @@ namespace FlexFlow { FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &); -FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); std::unordered_set replica_dims(ParallelTensorDims const &); /* size_t get_volume(ParallelTensorDims const &); */ @@ -22,14 +22,14 @@ ParallelTensorDims lift_to_parallel_with_degrees( TensorDims const &, SumDegree const &, DiscardCopyDegree const &, - FFOrdered const &shard_degrees); + FFOrdered const &shard_degrees); ParallelTensorDims lift_to_parallel_with_degrees(TensorDims const &, ParallelTensorDimDegrees const &); -nonnegative_int total_replica_degree(ParallelTensorDims const &); -nonnegative_int 
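Degrees like repartition_degree, replicate_degree, and the shard degrees above compose multiplicatively into a total parallel degree, which is why a positive type fits: a single zero degree would collapse the product and describe an impossible placement. A rough sketch of the relationship, with plain ints standing in for positive_int (total_parallel_degree_sketch is a made-up name):

    #include <cassert>
    #include <vector>

    int total_parallel_degree_sketch(int sum_degree,
                                     int discard_copy_degree,
                                     std::vector<int> const &shard_degrees) {
      assert(sum_degree >= 1 && discard_copy_degree >= 1);
      int total = sum_degree * discard_copy_degree;
      for (int d : shard_degrees) {
        assert(d >= 1); // a degree of 0 would zero out the whole product
        total *= d;
      }
      return total;
    }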
total_shard_degree(ParallelTensorDims const &); -nonnegative_int total_parallel_degree(ParallelTensorDims const &); +positive_int total_replica_degree(ParallelTensorDims const &); +positive_int total_shard_degree(ParallelTensorDims const &); +positive_int total_parallel_degree(ParallelTensorDims const &); ShardParallelDim shard_dim_at_idx(ParallelTensorDims const &, relative_ff_dim_t); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index d461ffc9e4..96d9bfb06a 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -17,7 +17,7 @@ ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t); -FFOrdered +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &); std::optional @@ -30,7 +30,7 @@ ParallelTensorShape lift_to_parallel_with_degrees( TensorShape const &, SumDegree const &, DiscardCopyDegree const &, - FFOrdered const &shard_degrees); + FFOrdered const &shard_degrees); ParallelTensorShape lift_to_parallel_with_degrees(TensorShape const &, ParallelTensorDimDegrees const &); @@ -38,13 +38,13 @@ ParallelTensorShape std::unordered_set replica_dims(ParallelTensorShape const &); TensorShape get_piece_shape(ParallelTensorShape const &); -nonnegative_int get_num_replica_dims(ParallelTensorShape const &); -nonnegative_int get_num_replicas(ParallelTensorShape const &); +positive_int get_num_replica_dims(ParallelTensorShape const &); +positive_int get_num_replicas(ParallelTensorShape const &); -nonnegative_int get_sum_degree(ParallelTensorShape const &); -nonnegative_int get_discard_copy_degree(ParallelTensorShape const &); +positive_int get_sum_degree(ParallelTensorShape const &); +positive_int get_discard_copy_degree(ParallelTensorShape const &); -nonnegative_int get_total_parallel_degree(ParallelTensorShape const &); +positive_int get_total_parallel_degree(ParallelTensorShape const &); bool is_valid(ParallelTensorShape const &); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml index 76b52bcdef..d60495bc3a 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "value" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml index 550a384ba9..f16586c4c9 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "value" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml index 5ca486181e..ac4c2563dc 100644 --- 
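SumDegree and DiscardCopyDegree wrap their positive_int value in distinct structs rather than passing bare integers, so the two replica degrees cannot be swapped at a call site. A rough sketch of that design choice (the *Sketch names are hypothetical):

    struct SumDegreeSketch {
      int value;
    };
    struct DiscardCopyDegreeSketch {
      int value;
    };

    int total_replica_degree_sketch(SumDegreeSketch sum,
                                    DiscardCopyDegreeSketch discard_copy) {
      return sum.value * discard_copy.value;
    }
    // Passing the arguments in the wrong order fails to compile, which a
    // bare pair of ints would silently accept.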
a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml @@ -11,12 +11,12 @@ features = [ includes = [ "op-attrs/replica_type.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "replica_type" diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h index 92d2b0abb2..85cea57523 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h @@ -8,11 +8,10 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set(); -nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &, +positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &, ReplicaType); std::unordered_set get_replica_dims(ReplicaParallelDimSet const &); -bool is_valid(ReplicaParallelDimSet const &); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml index 5c5d2dc5b2..a11897070f 100644 --- a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml @@ -10,13 +10,13 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index ba35295e09..a21602e28c 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -6,12 +6,12 @@ namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &); +FFOrdered const &ff_ordered(TensorDims const &); nonnegative_int num_dims(TensorDims const &); -nonnegative_int dim_at_idx(TensorDims const &, relative_ff_dim_t); -nonnegative_int &dim_at_idx(TensorDims &, relative_ff_dim_t); -nonnegative_int get_num_elements(TensorDims const &); +positive_int dim_at_idx(TensorDims const &, relative_ff_dim_t); +positive_int &dim_at_idx(TensorDims &, relative_ff_dim_t); +positive_int get_num_elements(TensorDims const &); bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal); diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index 8c6d1098cc..a1039798c9 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -11,9 +11,9 @@ features = [ includes = [ "op-attrs/ff_ordered/ff_ordered.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "ff_ordered" -type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" +type = "::FlexFlow::FFOrdered<::FlexFlow::positive_int>" diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index 298ea04638..3cafdda4b8 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -6,10 +6,10 @@ namespace FlexFlow { nonnegative_int num_dims(TensorShape const &); 
-nonnegative_int dim_at_idx(TensorShape const &, relative_ff_dim_t);
-nonnegative_int &dim_at_idx(TensorShape &, relative_ff_dim_t);
-nonnegative_int get_num_elements(TensorShape const &);
-nonnegative_int get_size_in_bytes(TensorShape const &);
+positive_int dim_at_idx(TensorShape const &, relative_ff_dim_t);
+positive_int &dim_at_idx(TensorShape &, relative_ff_dim_t);
+positive_int get_num_elements(TensorShape const &);
+positive_int get_size_in_bytes(TensorShape const &);
 
 TensorShape slice_tensor_shape(TensorShape const &,
                                relative_ff_dim_t const &start,
diff --git a/lib/op-attrs/src/op-attrs/datatype.cc b/lib/op-attrs/src/op-attrs/datatype.cc
index f8791521ab..d9e4a65f13 100644
--- a/lib/op-attrs/src/op-attrs/datatype.cc
+++ b/lib/op-attrs/src/op-attrs/datatype.cc
@@ -5,20 +5,20 @@
 
 namespace FlexFlow {
 
-nonnegative_int size_of_datatype(DataType data_type) {
+positive_int size_of_datatype(DataType data_type) {
   switch (data_type) {
     case DataType::BOOL:
-      return nonnegative_int{sizeof(bool)};
+      return positive_int{sizeof(bool)};
     case DataType::INT32:
-      return nonnegative_int{sizeof(int32_t)};
+      return positive_int{sizeof(int32_t)};
     case DataType::INT64:
-      return nonnegative_int{sizeof(int64_t)};
+      return positive_int{sizeof(int64_t)};
     case DataType::HALF:
-      return nonnegative_int{sizeof(float)} / 2_n;
+      return positive_int{sizeof(float) / 2};
     case DataType::FLOAT:
-      return nonnegative_int{sizeof(float)};
+      return positive_int{sizeof(float)};
     case DataType::DOUBLE:
-      return nonnegative_int{sizeof(double)};
+      return positive_int{sizeof(double)};
     default:
       throw mk_runtime_error(fmt::format("Unknown DataType {}", data_type));
   }
diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc
index 4604ef0b4e..dfb77dac5d 100644
--- a/lib/op-attrs/src/op-attrs/datatype_value.cc
+++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/datatype_value.h"
+#include "utils/overload.h"
 
 namespace FlexFlow {
 
@@ -22,4 +23,14 @@ DataTypeValue make_bool_data_type_value(bool value) {
   return DataTypeValue{value};
 }
 
+DataType get_data_type_of_data_type_value(DataTypeValue value) {
+  return value.visit(overload {
+      [](float) { return DataType::FLOAT; },
+      [](double) { return DataType::DOUBLE; },
+      [](int32_t) { return DataType::INT32; },
+      [](int64_t) { return DataType::INT64; },
+      [](bool) { return DataType::BOOL; },
+  });
+}
+
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/initializer_attrs.cc b/lib/op-attrs/src/op-attrs/initializer_attrs.cc
index 7635f170a0..b24b28a339 100644
--- a/lib/op-attrs/src/op-attrs/initializer_attrs.cc
+++ b/lib/op-attrs/src/op-attrs/initializer_attrs.cc
@@ -10,12 +10,12 @@ InitializerAttrs make_zero_initializer() {
 // fan_in and fan_out calculation from pytorch
 // see
 // https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363
-static nonnegative_int calculate_fan_for_mode(TensorDims const &dims,
-                                              KaimingInitializerMode mode) {
-  nonnegative_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0});
-  nonnegative_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1});
+static positive_int calculate_fan_for_mode(TensorDims const &dims,
+                                           KaimingInitializerMode mode) {
+  positive_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0});
+  positive_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1});
 
-  nonnegative_int receptive_field_size = get_num_elements(
+  positive_int receptive_field_size = get_num_elements(
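The new get_data_type_of_data_type_value leans on the usual visit-plus-overload idiom for variant types. A self-contained approximation against std::variant (the overloaded helper, DataTypeSketch, and DataTypeValueSketch are stand-ins; the library's DataTypeValue and utils/overload differ in detail):

    #include <cstdint>
    #include <variant>

    template <typename... Ts>
    struct overloaded : Ts... {
      using Ts::operator()...;
    };
    template <typename... Ts>
    overloaded(Ts...) -> overloaded<Ts...>;

    enum class DataTypeSketch { FLOAT, DOUBLE, INT32, INT64, BOOL };

    using DataTypeValueSketch =
        std::variant<float, double, int32_t, int64_t, bool>;

    DataTypeSketch data_type_of(DataTypeValueSketch const &v) {
      // std::visit dispatches to the lambda whose parameter type matches
      // the alternative currently held by the variant
      return std::visit(overloaded{
                            [](float) { return DataTypeSketch::FLOAT; },
                            [](double) { return DataTypeSketch::DOUBLE; },
                            [](int32_t) { return DataTypeSketch::INT32; },
                            [](int64_t) { return DataTypeSketch::INT64; },
                            [](bool) { return DataTypeSketch::BOOL; },
                        },
                        v);
    }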
slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); if (mode == KaimingInitializerMode::FAN_IN) { @@ -52,9 +52,9 @@ InitializerAttrs kaiming_uniform(TensorDims const &dims, KaimingInitializerNonlinearity nonlinearity, int seed) { - nonnegative_int fan = calculate_fan_for_mode(dims, mode); + positive_int fan = calculate_fan_for_mode(dims, mode); float gain = gain_for_nonlinearity(nonlinearity, a); - float std = gain / sqrtf(static_cast(fan.unwrap_nonnegative())); + float std = gain / sqrtf(static_cast(fan.int_from_positive_int())); float bound = sqrtf(3.0) * std; return InitializerAttrs{UniformInitializerAttrs{ diff --git a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc index b3d6e93c25..789903dc66 100644 --- a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc +++ b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc @@ -3,12 +3,12 @@ namespace FlexFlow { -nonnegative_int calculate_fan_for_mode(TensorDims const &dims, +positive_int calculate_fan_for_mode(TensorDims const &dims, KaimingInitializerMode mode) { - nonnegative_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0}); - nonnegative_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1}); + positive_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0}); + positive_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1}); - nonnegative_int receptive_field_size = get_num_elements( + positive_int receptive_field_size = get_num_elements( slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); if (mode == KaimingInitializerMode::FAN_IN) { diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 07d4f3e287..c5678e7bde 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -17,82 +17,82 @@ namespace FlexFlow { /* return is_valid; */ /* } */ -nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -nonnegative_int get_vProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_vProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.vdim; } -nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.embed_dim; } -nonnegative_int get_qSize(TensorShape const &query_shape) { +positive_int get_qSize(TensorShape const &query_shape) { return dim_at_idx(query_shape, relative_ff_dim_t{0}); } -nonnegative_int get_kSize(TensorShape const &key_shape) { +positive_int get_kSize(TensorShape const &key_shape) { return dim_at_idx(key_shape, relative_ff_dim_t{0}); } -nonnegative_int get_vSize(TensorShape const &value_shape) { +positive_int get_vSize(TensorShape const &value_shape) { return dim_at_idx(value_shape, relative_ff_dim_t{0}); } -nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { +positive_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.query_dim.size; } -nonnegative_int get_qSize(MultiHeadAttentionInputs const &inputs) { +positive_int get_qSize(MultiHeadAttentionInputs const &inputs) { return inputs.query_size; } -nonnegative_int get_kSize(MultiHeadAttentionParallelInputs 
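The kaiming_uniform hunk above follows the standard recipe: std = gain / sqrt(fan) and bound = sqrt(3) * std, so that a uniform(-bound, bound) distribution has exactly that standard deviation. A tiny worked check under assumed values (fan_in = 64 and a linear gain of 1.0 are chosen purely for illustration):

    #include <cmath>
    #include <cstdio>

    int main() {
      int fan_in = 64;   // assumed; FAN_IN = input fmaps * receptive field
      float gain = 1.0f; // assumed linear/identity nonlinearity
      float stddev = gain / std::sqrt(static_cast<float>(fan_in)); // 0.125
      float bound = std::sqrt(3.0f) * stddev;                      // ~0.2165
      std::printf("sample from uniform(-%f, %f)\n", bound, bound);
      return 0;
    }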
const &inputs) { +positive_int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.key_dim.size; } -nonnegative_int get_kSize(MultiHeadAttentionInputs const &inputs) { +positive_int get_kSize(MultiHeadAttentionInputs const &inputs) { return inputs.key_size; } -nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { +positive_int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.value_dim.size; } -nonnegative_int get_vSize(MultiHeadAttentionInputs const &inputs) { +positive_int get_vSize(MultiHeadAttentionInputs const &inputs) { return inputs.value_size; } -nonnegative_int +positive_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; } -nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { +positive_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; } -nonnegative_int +positive_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; // FIXME -- assumes only prefill } -nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { +positive_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; // FIXME -- assumes only prefil } -nonnegative_int +positive_int get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { return inputs.batch_dim.size; } -nonnegative_int get_num_samples(MultiHeadAttentionInputs const &inputs) { +positive_int get_num_samples(MultiHeadAttentionInputs const &inputs) { return inputs.batch_size; } @@ -139,7 +139,7 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ parsed.batch_size, parsed.sequence_length, attrs.embed_dim, @@ -164,20 +164,20 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); // W^Q_i in "Attention Is All You Need" top of page 5 - nonnegative_int qProjectWeightSize = parsed.query_size * attrs.kdim; + positive_int qProjectWeightSize = parsed.query_size * attrs.kdim; // W^K_i in "Attention Is All You Need" top of page 5 (all i's put together) - nonnegative_int kProjectWeightSize = parsed.key_size * attrs.kdim; + positive_int kProjectWeightSize = parsed.key_size * attrs.kdim; // W^V_i in "Attention Is All You Need" top of page 5 (all i's put together) - nonnegative_int vProjectWeightSize = parsed.value_size * attrs.vdim; + positive_int vProjectWeightSize = parsed.value_size * attrs.vdim; // W^O in "Attention Is All You Need" top of page 5, with num_heads factored // out - nonnegative_int outWeightSize = attrs.vdim * attrs.embed_dim; + positive_int outWeightSize = attrs.vdim * attrs.embed_dim; return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ (qProjectWeightSize + kProjectWeightSize + vProjectWeightSize + outWeightSize), attrs.num_heads, @@ -203,7 +203,7 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ attrs.kdim + attrs.kdim + attrs.vdim, }}, parsed.datatype, @@ -227,7 +227,7 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ attrs.embed_dim, }}, parsed.datatype, @@ -278,14 +278,14 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - nonnegative_int joined_dim_degree = 1_n; - nonnegative_int head_dim_degree = parsed.discard_copy_degree.value; + positive_int joined_dim_degree = 1_p; + positive_int head_dim_degree = parsed.discard_copy_degree.value; return 
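The weight-shape computation above packs W^Q, W^K, W^V, and W^O into one per-head tensor. A worked check of that arithmetic using the same illustrative sizes as the test case later in this patch (feature_size = 36, embed_dim = kdim = vdim = 32):

    #include <cassert>

    int main() {
      int feature_size = 36, embed_dim = 32, kdim = 32, vdim = 32;
      int q_proj = feature_size * kdim; // W^Q_i
      int k_proj = feature_size * kdim; // W^K_i
      int v_proj = feature_size * vdim; // W^V_i
      int out_w = vdim * embed_dim;     // W^O, num_heads factored out
      // matches the (feature_size * embed_dim) * 3 + embed_dim * embed_dim
      // expression used by the attention test below
      assert(q_proj + k_proj + v_proj + out_w ==
             (feature_size * embed_dim) * 3 + embed_dim * embed_dim);
      return 0;
    }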
lift_to_parallel_with_degrees( unpar_shape, - SumDegree{1_n}, + SumDegree{1_p}, DiscardCopyDegree{parsed.batch_dim.degree}, - FFOrdered{joined_dim_degree, head_dim_degree}); + FFOrdered{joined_dim_degree, head_dim_degree}); } tl::expected @@ -318,10 +318,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1_n}; + FFOrdered shard_degrees = FFOrdered{1_p}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -356,10 +356,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1_n}; + FFOrdered shard_degrees = FFOrdered{1_p}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -388,24 +388,24 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - nonnegative_int sum_degree = parsed.discard_copy_degree.value; - nonnegative_int discard_copy_degree = 1_n; - nonnegative_int batch_degree = parsed.batch_dim.degree; - nonnegative_int seq_len_degree = 1_n; - nonnegative_int out_dim_degree = 1_n; + positive_int sum_degree = parsed.discard_copy_degree.value; + positive_int discard_copy_degree = 1_p; + positive_int batch_degree = parsed.batch_dim.degree; + positive_int seq_len_degree = 1_p; + positive_int out_dim_degree = 1_p; return lift_to_parallel_with_degrees( unpar_shape, SumDegree{sum_degree}, DiscardCopyDegree{discard_copy_degree}, - FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); + FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); } -nonnegative_int get_oSize(ParallelTensorShape const &) { +positive_int get_oSize(ParallelTensorShape const &) { NOT_IMPLEMENTED(); } -nonnegative_int get_oSize(TensorShape const &) { +positive_int get_oSize(TensorShape const &) { NOT_IMPLEMENTED(); } diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index b9049bf461..7bf3b9d91e 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -31,9 +31,9 @@ tl::expected 3)); } - nonnegative_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); - nonnegative_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); - nonnegative_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); + positive_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); + positive_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); + positive_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +43,9 @@ tl::expected seq_len_v)); } - nonnegative_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); - nonnegative_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); - nonnegative_int batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); + positive_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); + positive_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); + positive_int 
batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +63,9 @@ tl::expected input_v.data_type)); } - nonnegative_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); - nonnegative_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); - nonnegative_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); + positive_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); + positive_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); + positive_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc index d69b62b759..3225f1aef2 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc @@ -107,9 +107,9 @@ tl::expected value_dim.degree)); } - nonnegative_int discard_copy_q = get_discard_copy_degree(input_q); - nonnegative_int discard_copy_k = get_discard_copy_degree(input_k); - nonnegative_int discard_copy_v = get_discard_copy_degree(input_v); + positive_int discard_copy_q = get_discard_copy_degree(input_q); + positive_int discard_copy_k = get_discard_copy_degree(input_k); + positive_int discard_copy_v = get_discard_copy_degree(input_v); if (!all_same(discard_copy_q, discard_copy_k, discard_copy_v)) { return tl::unexpected(fmt::format("Q, K, V disagree on the discard-copy " diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index d32ae33d14..d11a8aba10 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -57,13 +57,13 @@ tl::expected input_rhs.data_type)); } - nonnegative_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); - nonnegative_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); - nonnegative_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); + positive_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); + positive_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); + positive_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); - nonnegative_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); - nonnegative_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); - nonnegative_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); + positive_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); + positive_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); + positive_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( @@ -76,7 +76,7 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ lhs_b, n, p, @@ -151,10 +151,11 @@ tl::expected ShardParallelDim output_n = n; ShardParallelDim output_p = p; - nonnegative_int output_discard_copy_degree = 1_n; - nonnegative_int output_sum_degree = + positive_int output_discard_copy_degree = 1_p; + positive_int output_sum_degree = positive_int{ get_total_parallel_degree(input_lhs) / - (output_b.degree * output_n.degree * output_p.degree); + (output_b.degree * output_n.degree * output_p.degree) + }; ParallelTensorShape result = ParallelTensorShape{ ParallelTensorDims{ diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index 
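The batch_matmul output_sum_degree expression above is bookkeeping: whatever parallelism of the input is not accounted for by the output's own shard degrees must reappear as a sum reduction. A sketch with plain ints (the concrete numbers are illustrative, and divisibility is assumed to have been validated upstream):

    #include <cassert>

    int main() {
      int total_degree = 8;    // total parallel degree of the lhs input
      int b = 2, n = 2, p = 1; // output shard degrees
      assert(total_degree % (b * n * p) == 0);
      int sum_degree = total_degree / (b * n * p);
      assert(sum_degree == 2); // two-way partial sums remain
      return 0;
    }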
ddd92bd417..f42467320b 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -68,10 +68,10 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_channels, }}, DataType::FLOAT, @@ -113,23 +113,23 @@ static std::optional input_degrees); } - if (input_degrees.sum_degree != SumDegree{1_n}) { + if (input_degrees.sum_degree != SumDegree{1_p}) { return fmt::format("Expected sum degree 1, but receieved sum degree {}", input_degrees.sum_degree); } - if (input_degrees.discard_copy_degree != DiscardCopyDegree{1_n}) { + if (input_degrees.discard_copy_degree != DiscardCopyDegree{1_p}) { return fmt::format( "Expected discard copy degree 1, but receieved discard copy degree {}", input_degrees.discard_copy_degree); } - FFOrdered non_channel_degrees = + FFOrdered non_channel_degrees = concat(slice(input_degrees.shard_degrees, ff_dim_t{0_n}, ff_dim_t{1_n}), slice(input_degrees.shard_degrees, ff_dim_t{2_n}, std::nullopt)); if (any_of(non_channel_degrees, - [](nonnegative_int degree) { return degree != 1_n; })) { + [](positive_int degree) { return degree != 1_p; })) { return fmt::format("Expected parallel degree of all non-channel dimensions " "to be 1, but received input with degrees {}", input_degrees); @@ -172,9 +172,9 @@ tl::expected relative_ff_dim_t channel_dim = relative_ff_dim_t{1}; return ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/combine.cc b/lib/op-attrs/src/op-attrs/ops/combine.cc index 636f37dcea..c55bdc55bb 100644 --- a/lib/op-attrs/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/src/op-attrs/ops/combine.cc @@ -44,8 +44,10 @@ tl::expected } ParallelTensorShape output = input; - shard_dim_at_idx(output, relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim)) - .degree /= attrs.combine_degree; + relative_ff_dim_t combine_dim = relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); + shard_dim_at_idx(output, combine_dim).degree = positive_int{ + shard_dim_at_idx(output, combine_dim).degree / attrs.combine_degree + }; return output; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index bf0ba553e4..b41d1ffc32 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -17,7 +17,7 @@ tl::expected get_output_shape(ConcatAttrs const &attrs, std::vector const &inputs) { auto get_non_axis_dims = [&](TensorShape const &s) { - std::map dim_sizes = + std::map dim_sizes = enumerate(ff_ordered(s.dims)); dim_sizes.erase(attrs.axis); return dim_sizes; @@ -41,8 +41,8 @@ tl::expected inputs)); } - std::map non_axis_dims = ({ - tl::expected, std::string> returned = + std::map non_axis_dims = ({ + tl::expected, std::string> returned = require_all_same1(transform(inputs, get_non_axis_dims)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -50,12 +50,12 @@ tl::expected returned.value(); }); - std::vector axis_dim_sizes = + std::vector axis_dim_sizes = transform(inputs, [&](TensorShape const &s) { return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); 
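The combine.cc rewrite above replaces an in-place /= on the degree with an explicit positive_int construction, since plain integer division does not carry the positivity invariant through the type system. A sketch of the checked division it corresponds to (checked_combine is a hypothetical helper, not a library function):

    #include <stdexcept>

    int checked_combine(int degree, int combine_degree) {
      if (degree % combine_degree != 0) {
        throw std::invalid_argument(
            "combine_degree must evenly divide the dimension's degree");
      }
      return degree / combine_degree; // >= 1 whenever the inputs are valid
    }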
}); - nonnegative_int output_axis_dim_size = sum(axis_dim_sizes); + positive_int output_axis_dim_size = sum(axis_dim_sizes); non_axis_dims.insert({attrs.axis, output_axis_dim_size}); @@ -89,7 +89,7 @@ tl::expected }); SumDegree sum_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_sum_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -98,7 +98,7 @@ tl::expected }); DiscardCopyDegree discard_copy_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_discard_copy_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index 902417d050..af4b6cd898 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -27,7 +27,7 @@ TensorShape get_kernel_shape(Conv2DAttrs const &attrs, Conv2DInputShape input = parse_input_shape(raw_input_shape); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ attrs.out_channels, input.num_channels, attrs.kernel_h, @@ -44,22 +44,22 @@ TensorShape get_bias_shape(Conv2DAttrs const &attrs, return TensorShape{ TensorDims{ - FFOrdered{attrs.out_channels}, + FFOrdered{attrs.out_channels}, }, input.datatype, }; } -static nonnegative_int calculate_output_size(nonnegative_int input_size, +static positive_int calculate_output_size(positive_int input_size, nonnegative_int padding_size, - nonnegative_int kernel_size, - nonnegative_int stride) { - int input_size_raw = input_size.unwrap_nonnegative(); + positive_int kernel_size, + positive_int stride) { + int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); - int kernel_size_raw = kernel_size.unwrap_nonnegative(); - int stride_raw = stride.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.int_from_positive_int(); + int stride_raw = stride.int_from_positive_int(); - return nonnegative_int{ + return positive_int{ (input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; } @@ -68,18 +68,18 @@ TensorShape get_output_shape(Conv2DAttrs const &attrs, assert(attrs.groups == 1); // TODO(@lockshaw): currently not supported Conv2DInputShape input = parse_input_shape(raw_input_shape); - nonnegative_int out_height = + positive_int out_height = calculate_output_size(/*input_size=*/input.height, /*padding_size=*/attrs.padding_h, /*kernel_size=*/attrs.kernel_h, /*stride_size=*/attrs.stride_h); - nonnegative_int out_width = + positive_int out_width = calculate_output_size(/*input_size=*/input.width, /*padding_size=*/attrs.padding_w, /*kernel_size=*/attrs.kernel_w, /*stride_size=*/attrs.stride_w); - return TensorShape{TensorDims{FFOrdered{ + return TensorShape{TensorDims{FFOrdered{ input.num_samples, attrs.out_channels, out_height, @@ -112,14 +112,14 @@ ParallelTensorShape get_kernel_shape(Conv2DAttrs const &attrs, assert(parsed.height_dim.degree == 1); assert(parsed.width_dim.degree == 1); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.sample_dim.degree * parsed.sum_reduction_degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, parsed.channel_dim.degree, - 1_n, - 1_n, + 1_p, + 1_p, }; return lift_to_parallel_with_degrees( @@ -139,7 +139,7 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, 
DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.height_dim.degree * parsed.width_dim.degree * parsed.sample_dim.degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, }; @@ -160,12 +160,12 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{parsed.sum_reduction_degree * parsed.channel_dim.degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; - FFOrdered shard_degrees = { + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; + FFOrdered shard_degrees = { parsed.sample_dim.degree, parsed.discard_copy_reduction_degree, - 1_n, - 1_n, + 1_p, + 1_p, }; return lift_to_parallel_with_degrees( @@ -217,11 +217,10 @@ std::vector InitializerAttrs kernel_initializer = maybe_kernel_initializer.value_or(kernel_default_initializer); - nonnegative_int fan_in = + positive_int fan_in = calculate_fan_for_mode(kernel_shape.dims, KaimingInitializerMode::FAN_IN); - assert(fan_in != 0_n); - float bound = 1 / sqrtf(static_cast(fan_in.unwrap_nonnegative())); + float bound = 1 / sqrtf(static_cast(fan_in.int_from_positive_int())); InitializerAttrs bias_default_initializer = InitializerAttrs{UniformInitializerAttrs{ diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index 1491410491..75db5c56fb 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -6,10 +6,10 @@ namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { assert(num_dims(input) == 4); - nonnegative_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); - nonnegative_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); - nonnegative_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); - nonnegative_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); + positive_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); + positive_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); + positive_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); return Conv2DInputShape{ num_samples, diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 5b5b91a8e7..809b4cdaf9 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -68,7 +68,7 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ attrs.num_entries, attrs.out_channels, }, @@ -92,8 +92,8 @@ tl::expected SumDegree sum_degree = SumDegree{shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; - FFOrdered shard_degrees = + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; + FFOrdered shard_degrees = transform(input.dims.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); @@ -114,13 +114,13 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(transform( ff_ordered_shard_dims(input.dims), - [](ShardParallelDim const &d) -> nonnegative_int { return d.degree; }))}; - nonnegative_int entry_dim_degree = 1_n; - nonnegative_int out_channel_degree = 
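Note the dropped assert(fan_in != 0_n) in the conv_2d bias-initializer hunk above: once fan_in is a positive_int, the zero case cannot be constructed, so the subsequent division needs no runtime guard. A reduced sketch, with plain int standing in for positive_int:

    #include <cmath>

    // fan_in arrives as a positive value by construction, so the old
    // runtime assertion becomes unrepresentable state
    float bias_bound(int fan_in /* >= 1 by construction */) {
      return 1.0f / std::sqrt(static_cast<float>(fan_in));
    }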
get_discard_copy_degree(input); - FFOrdered shard_degrees = { + [](ShardParallelDim const &d) -> positive_int { return d.degree; }))}; + positive_int entry_dim_degree = 1_p; + positive_int out_channel_degree = get_discard_copy_degree(input); + FFOrdered shard_degrees = { entry_dim_degree, out_channel_degree, }; diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index b4eeda76ab..a2183a71b4 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -11,11 +11,11 @@ namespace FlexFlow { TensorShape get_output_shape(FlatAttrs const &attrs, TensorShape const &input_shape) { - FFOrdered leading_dims = + FFOrdered leading_dims = slice(ff_ordered(input_shape.dims), ff_dim_t{0_n}, attrs.start_dim); - FFOrdered flattened_dims = + FFOrdered flattened_dims = slice(ff_ordered(input_shape.dims), attrs.start_dim, attrs.end_dim); - FFOrdered trailing_dims = + FFOrdered trailing_dims = slice(ff_ordered(input_shape.dims), attrs.end_dim, std::nullopt); if (flattened_dims.empty()) { @@ -37,7 +37,7 @@ TensorShape get_output_shape(FlatAttrs const &attrs, tl::expected get_output_parallel_dim_degrees( FlatAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees) { - FFOrdered flattened_dim_degrees = + FFOrdered flattened_dim_degrees = slice(input_degrees.shard_degrees, attrs.start_dim, attrs.end_dim); if (flattened_dim_degrees.empty()) { @@ -45,7 +45,7 @@ tl::expected } if (any_of(flattened_dim_degrees, - [](nonnegative_int degree) { return degree != 1; })) { + [](positive_int degree) { return degree != 1; })) { return tl::unexpected( fmt::format("get_output_parallel_dim_degrees for {} expected all shard " "degrees of flattened dimensions to be 1, but received {}", diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index c9798368e2..3637aacc5c 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -72,7 +72,7 @@ tl::expected std::vector non_layer_norm_dim_idxs = filter( get_idxs(input_shape.dims.ff_ordered), [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); - std::vector raw_weight_dims = + std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { return dim_at_idx(input_shape, relative_ff_dim_t_from_ff_dim_t(dim_idx)); @@ -190,8 +190,8 @@ tl::expected ParallelTensorDims{ ff_ordered_of(raw_weight_shard_dims), ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index bee9d0cf4f..578e9ce652 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -44,11 +44,11 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - nonnegative_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); + positive_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ - FFOrdered{in_channels, attrs.out_channels}, + FFOrdered{in_channels, attrs.out_channels}, }, input_shape.data_type, }; @@ -58,7 +58,7 @@ tl::expected get_bias_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { return TensorShape{ TensorDims{ - FFOrdered{attrs.out_channels}, + FFOrdered{attrs.out_channels}, }, input_shape.data_type, }; @@ 
-99,12 +99,12 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{ + FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, get_discard_copy_degree(input), }; @@ -131,8 +131,8 @@ tl::expected DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input), relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = - FFOrdered{get_discard_copy_degree(input)}; + FFOrdered shard_degrees = + FFOrdered{get_discard_copy_degree(input)}; return lift_to_parallel_with_degrees( unpar, sum_degree, discard_copy_degree, shard_degrees); @@ -153,8 +153,8 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; - FFOrdered shard_degrees = ff_ordered_shard_degrees(input); + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; + FFOrdered shard_degrees = ff_ordered_shard_degrees(input); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); return lift_to_parallel_with_degrees( @@ -209,10 +209,10 @@ tl::expected, std::string> get_initializers( InitializerAttrs projection_initializer = maybe_projection_initializer.value_or(projection_default_initializer); - nonnegative_int fan_in = calculate_fan_for_mode( + positive_int fan_in = calculate_fan_for_mode( projection_shape.dims, KaimingInitializerMode::FAN_IN); - float bound = 1 / sqrtf(static_cast(fan_in.unwrap_nonnegative())); + float bound = 1 / sqrtf(static_cast(fan_in.int_from_positive_int())); InitializerAttrs bias_default_initializer = InitializerAttrs{UniformInitializerAttrs{ diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index f9630e16b1..c542d688b3 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -8,8 +8,8 @@ namespace FlexFlow { tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp pool_type, std::optional const &activation) { // AdaptivePool2D semantics pulled from @@ -22,10 +22,10 @@ tl::expected input_dims)); } - nonnegative_int num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); - nonnegative_int num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); - nonnegative_int input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); - nonnegative_int input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); + positive_int num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); + positive_int input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); + positive_int input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); if (input_h % output_h != 0) { return tl::unexpected(fmt::format( @@ -55,11 +55,11 @@ tl::expected // = `ind / outd` // = `stride` - nonnegative_int kernel_h = input_h / output_h; - nonnegative_int kernel_w = input_w / output_w; + positive_int kernel_h = positive_int{input_h / output_h}; + positive_int kernel_w = positive_int{input_w / output_w}; - nonnegative_int stride_h = kernel_h; - nonnegative_int stride_w = kernel_w; + 
positive_int stride_h = kernel_h; + positive_int stride_w = kernel_w; Pool2DAttrs attrs = Pool2DAttrs{ /*kernel_h=*/kernel_h, @@ -73,7 +73,7 @@ tl::expected }; TensorShape expected_ouput_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, num_channels, output_h, @@ -104,16 +104,16 @@ tl::expected return attrs; } -static nonnegative_int calculate_output_size(nonnegative_int input_size, +static positive_int calculate_output_size(positive_int input_size, nonnegative_int padding_size, - nonnegative_int kernel_size, - nonnegative_int stride) { - int input_size_raw = input_size.unwrap_nonnegative(); + positive_int kernel_size, + positive_int stride) { + int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); - int kernel_size_raw = kernel_size.unwrap_nonnegative(); - int stride_raw = stride.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.int_from_positive_int(); + int stride_raw = stride.int_from_positive_int(); - return nonnegative_int{ + return positive_int{ (input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; } @@ -126,23 +126,23 @@ tl::expected input_shape)); } - nonnegative_int num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); - nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); - nonnegative_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); - nonnegative_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); + positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + positive_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); + positive_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); - nonnegative_int output_height = + positive_int output_height = calculate_output_size(/*input_size=*/input_height, /*padding_size=*/attrs.padding_h, /*kernel_size=*/attrs.kernel_h, /*stride_size=*/attrs.stride_h); - nonnegative_int output_width = + positive_int output_width = calculate_output_size(/*input_size=*/input_width, /*padding_size=*/attrs.padding_w, /*kernel_size=*/attrs.kernel_w, /*stride_size=*/attrs.stride_w); - return TensorShape{TensorDims{FFOrdered{ + return TensorShape{TensorDims{FFOrdered{ num_samples, num_channels, output_height, diff --git a/lib/op-attrs/src/op-attrs/ops/reduction.cc b/lib/op-attrs/src/op-attrs/ops/reduction.cc index 0a9f3e3b97..007559a816 100644 --- a/lib/op-attrs/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/src/op-attrs/ops/reduction.cc @@ -29,7 +29,10 @@ tl::expected } ParallelTensorShape output_shape = input_shape; - output_shape.dims.replica_dims.sum_degree.value /= attrs.reduction_degree; + + output_shape.dims.replica_dims.sum_degree.value = positive_int{ + output_shape.dims.replica_dims.sum_degree.value / attrs.reduction_degree + }; return output_shape; } diff --git a/lib/op-attrs/src/op-attrs/ops/weight.cc b/lib/op-attrs/src/op-attrs/ops/weight.cc index 906d2c58d0..710529af0a 100644 --- a/lib/op-attrs/src/op-attrs/ops/weight.cc +++ b/lib/op-attrs/src/op-attrs/ops/weight.cc @@ -6,7 +6,7 @@ namespace FlexFlow { RecordFormatter as_dot(WeightAttrs const &attrs) { RecordFormatter r; - for (nonnegative_int dim : attrs.tensor_shape.dims.ff_ordered) { + for (positive_int dim : attrs.tensor_shape.dims.ff_ordered) { r << fmt::to_string(dim); } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 
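The adaptive-pool derivation above sets kernel = stride = input/output per spatial dimension, and the forward size formula then recovers exactly the requested output. A worked check with illustrative sizes (8 -> 2, so kernel = stride = 4):

    #include <cassert>

    // same formula as calculate_output_size in the hunk above
    int output_size(int in, int pad, int kernel, int stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      int input_h = 8, output_h = 2;
      assert(input_h % output_h == 0); // validated by the attrs builder
      int kernel_h = input_h / output_h; // 4
      int stride_h = kernel_h;           // 4
      assert(output_size(input_h, /*pad=*/0, kernel_h, stride_h) == output_h);
      return 0;
    }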
3f2245b2dc..8a96bc25ba 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -19,7 +19,7 @@ FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &d) { return d.shard_dims; } -FFOrdered +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &d) { return transform(d.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); @@ -43,22 +43,22 @@ ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { } ParallelTensorDims lift_to_parallel(TensorDims const &dims) { - std::vector shard_degrees = - repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_n); + std::vector shard_degrees = + repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_p); return lift_to_parallel_with_degrees( - dims, SumDegree{1_n}, DiscardCopyDegree{1_n}, shard_degrees); + dims, SumDegree{1_p}, DiscardCopyDegree{1_p}, shard_degrees); } ParallelTensorDims lift_to_parallel_with_degrees( TensorDims const &unpar, SumDegree const &sum_degree, DiscardCopyDegree const &discard_copy_degree, - FFOrdered const &shard_degrees) { + FFOrdered const &shard_degrees) { std::vector lifted = transform(zip(vector_of(unpar.ff_ordered), vector_of(shard_degrees)), - [](std::pair const &p) { - nonnegative_int size = p.first; - nonnegative_int degree = p.second; + [](std::pair const &p) { + positive_int size = p.first; + positive_int degree = p.second; return ShardParallelDim{size, degree}; }); @@ -78,17 +78,17 @@ ParallelTensorDims degrees.shard_degrees); } -nonnegative_int total_replica_degree(ParallelTensorDims const &dims) { +positive_int total_replica_degree(ParallelTensorDims const &dims) { return dims.replica_dims.discard_copy_degree.value * dims.replica_dims.sum_degree.value; } -nonnegative_int total_shard_degree(ParallelTensorDims const &dims) { +positive_int total_shard_degree(ParallelTensorDims const &dims) { return product(transform(vector_of(dims.shard_dims), [](ShardParallelDim const &d) { return d.degree; })); } -nonnegative_int total_parallel_degree(ParallelTensorDims const &dims) { +positive_int total_parallel_degree(ParallelTensorDims const &dims) { return total_replica_degree(dims) * total_shard_degree(dims); } @@ -118,7 +118,7 @@ TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &) { } TensorDims get_reduced_dims(ParallelTensorDims const &dims) { - FFOrdered dim_sizes = transform( + FFOrdered dim_sizes = transform( dims.shard_dims, [](ShardParallelDim const &d) { return d.size; }); return TensorDims{dim_sizes}; } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index 260ec7c3cd..ff6debee4f 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -20,21 +20,21 @@ std::unordered_set return replica_dims(s.dims); } -nonnegative_int get_num_replicas(ParallelTensorShape const &shape) { +positive_int get_num_replicas(ParallelTensorShape const &shape) { return product(transform( replica_dims(shape), - [](ReplicaParallelDim const &d) -> nonnegative_int { return d.degree; })); + [](ReplicaParallelDim const &d) -> positive_int { return d.degree; })); } -nonnegative_int get_sum_degree(ParallelTensorShape const &shape) { +positive_int get_sum_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.sum_degree.value; } -nonnegative_int get_discard_copy_degree(ParallelTensorShape const &shape) { +positive_int get_discard_copy_degree(ParallelTensorShape const &shape) { 
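lift_to_parallel, as rewritten above, pairs every tensor dimension with a trivial degree of 1_p: an unpartitioned tensor is the degenerate case of a parallel one. A structural sketch with plain types (DimSketch and lift_sketch are illustrative only):

    #include <vector>

    struct DimSketch {
      int size;
      int degree;
    };

    std::vector<DimSketch> lift_sketch(std::vector<int> const &sizes) {
      std::vector<DimSketch> result;
      for (int size : sizes) {
        result.push_back(DimSketch{size, /*degree=*/1}); // unpartitioned
      }
      return result;
    }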
return shape.dims.replica_dims.discard_copy_degree.value; } -nonnegative_int get_total_parallel_degree(ParallelTensorShape const &s) { +positive_int get_total_parallel_degree(ParallelTensorShape const &s) { return total_parallel_degree(s.dims); } @@ -52,7 +52,7 @@ ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, return shard_dim_at_idx(s.dims, d); } -FFOrdered +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { return ff_ordered_shard_degrees(s.dims); } @@ -79,7 +79,7 @@ ParallelTensorShape lift_to_parallel_with_degrees( TensorShape const &unpar, SumDegree const &sum_degree, DiscardCopyDegree const &discard_copy_degree, - FFOrdered const &shard_degrees) { + FFOrdered const &shard_degrees) { return ParallelTensorShape{ lift_to_parallel_with_degrees( unpar.dims, sum_degree, discard_copy_degree, shard_degrees), @@ -97,8 +97,8 @@ ParallelTensorShape } TensorShape require_not_parallel(ParallelTensorShape const &s) { - nonnegative_int total_degree = get_total_parallel_degree(s); - if (total_degree != 1_n) { + positive_int total_degree = get_total_parallel_degree(s); + if (total_degree != 1_p) { throw mk_runtime_error( fmt::format("Error: require_not_parallel received a parallel tensor " "shape with parallel degree {}: {}", @@ -132,7 +132,7 @@ ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, }, [&](ReplicaType replica_type) { ReplicaParallelDimSet replicas = shape.dims.replica_dims; - nonnegative_int degree = (ReplicaType::SUM == replica_type + positive_int degree = (ReplicaType::SUM == replica_type ? replicas.sum_degree.value : replicas.discard_copy_degree.value); return ParallelDim{ReplicaParallelDim{degree, replica_type}}; diff --git a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc index fc712be10b..41fb988bf7 100644 --- a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc +++ b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc @@ -4,10 +4,10 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set() { - return ReplicaParallelDimSet{SumDegree{1_n}, DiscardCopyDegree{1_n}}; + return ReplicaParallelDimSet{SumDegree{1_p}, DiscardCopyDegree{1_p}}; } -nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, +positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, ReplicaType replica_type) { switch (replica_type) { case ReplicaType::SUM: @@ -29,8 +29,4 @@ std::unordered_set }; } -bool is_valid(ReplicaParallelDimSet const &s) { - return s.sum_degree.value > 0 && s.discard_copy_degree.value > 0; -} - } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 760278297c..b48a23b281 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -14,7 +14,7 @@ namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &dims) { +FFOrdered const &ff_ordered(TensorDims const &dims) { return dims.ff_ordered; } @@ -22,15 +22,15 @@ nonnegative_int num_dims(TensorDims const &dims) { return num_elements(dims.ff_ordered); } -nonnegative_int dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { +positive_int dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -nonnegative_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { +positive_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -nonnegative_int get_num_elements(TensorDims const &d) 
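The deletion of is_valid(ReplicaParallelDimSet) above is the payoff of the whole migration: the degree > 0 condition it checked now holds by construction, so the function is dead code. A sketch of why (PositiveDegree is a stand-in for the wrapper type):

    #include <cassert>

    struct PositiveDegree {
      explicit PositiveDegree(int v) : value(v) {
        assert(v >= 1); // enforced once, at construction, not at use sites
      }
      int value;
    };

    // Any dim set built from PositiveDegree values cannot represent the
    // degree == 0 state that is_valid() used to reject.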
{ +positive_int get_num_elements(TensorDims const &d) { return product(d.ff_ordered); } @@ -40,8 +40,8 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, return false; } - std::vector curr_dims = vector_of(curr.ff_ordered); - std::vector goal_dims = vector_of(goal.ff_ordered); + std::vector curr_dims = vector_of(curr.ff_ordered); + std::vector goal_dims = vector_of(goal.ff_ordered); for (auto const &[curr_dim, goal_dim] : zip(reversed(curr_dims), reversed(goal_dims))) { diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index afc14af54c..7a1ba810a7 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -12,19 +12,19 @@ nonnegative_int num_dims(TensorShape const &s) { return num_elements(s.dims.ff_ordered); } -nonnegative_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { +positive_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -nonnegative_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { +positive_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -nonnegative_int get_num_elements(TensorShape const &s) { +positive_int get_num_elements(TensorShape const &s) { return get_num_elements(s.dims); } -nonnegative_int get_size_in_bytes(TensorShape const &s) { +positive_int get_size_in_bytes(TensorShape const &s) { return get_num_elements(s) * size_of_datatype(s.data_type); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/attention.cc b/lib/op-attrs/test/src/op-attrs/ops/attention.cc index b317c5c69c..a99fe167c7 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/attention.cc @@ -10,10 +10,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs)") { auto make_attrs = [](bool bias) { return MultiHeadAttentionAttrs{ - /*embed_dim=*/32_n, - /*num_heads=*/10_n, - /*kdim=*/32_n, - /*vdim=*/32_n, + /*embed_dim=*/32_p, + /*num_heads=*/10_p, + /*kdim=*/32_p, + /*vdim=*/32_p, /*dropout=*/0.0, /*bias=*/bias, /*add_bias_kv=*/false, @@ -58,8 +58,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(MultiHeadAttentionAttrs, TensorShape, " "TensorShape, TensorShape)") { - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; /* Parameter meanings match those at * https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html @@ -75,13 +75,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; TensorShape input_q = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -92,7 +92,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_k = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -103,7 +103,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_v = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -114,7 +114,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, attrs.embed_dim, @@ -125,8 +125,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - (feature_size * 
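get_size_in_bytes above is simply get_num_elements times size_of_datatype. A worked check using the FLOAT test tensors that follow (shape 40 x 48 x 36, 4 bytes per element):

    #include <cassert>

    int main() {
      long long num_elements = 40LL * 48 * 36; // batch * seq_len * features
      long long size_in_bytes = num_elements * 4; // sizeof(float)
      assert(num_elements == 69120);
      assert(size_in_bytes == 276480);
      return 0;
    }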
embed_dim) * 3_n + (embed_dim * embed_dim), + FFOrdered{ + (feature_size * embed_dim) * 3_p + (embed_dim * embed_dim), num_heads, }, }, @@ -135,8 +135,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_bias = TensorShape{ TensorDims{ - FFOrdered{ - embed_dim * 3_n, + FFOrdered{ + embed_dim * 3_p, }, }, DataType::FLOAT, @@ -144,7 +144,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_bias = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ embed_dim, }, }, @@ -184,94 +184,94 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallel shape inference") { auto make_q = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_q) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_q) { return lift_to_parallel_with_degrees( input_q, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_q}); + FFOrdered{o_batch, o_seq_len, o_q}); }; auto make_k = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_k) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_k) { return lift_to_parallel_with_degrees( input_k, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_k}); + FFOrdered{o_batch, o_seq_len, o_k}); }; auto make_v = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_v) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_v) { return lift_to_parallel_with_degrees( input_v, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_v}); + FFOrdered{o_batch, o_seq_len, o_v}); }; auto make_o = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_o) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_o) { return lift_to_parallel_with_degrees( output, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_o}); + FFOrdered{o_batch, o_seq_len, o_o}); }; auto make_w = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_e, - nonnegative_int o_h) { + positive_int o_e, + positive_int o_h) { return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_e, o_h}); + weights, o_sum, o_eq, FFOrdered{o_e, o_h}); }; auto make_input_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_in_proj_channel) { + positive_int o_in_proj_channel) { return lift_to_parallel_with_degrees( input_bias, o_sum, o_eq, - FFOrdered{o_in_proj_channel}); + FFOrdered{o_in_proj_channel}); }; auto make_output_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_out_proj_channel) { + positive_int o_out_proj_channel) { return lift_to_parallel_with_degrees( output_bias, o_sum, o_eq, - FFOrdered{o_out_proj_channel}); + FFOrdered{o_out_proj_channel}); }; SUBCASE("data parallelism") { - nonnegative_int o_b = 4_n; + positive_int o_b = 4_p; ParallelTensorShape q = - make_q(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_q(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); ParallelTensorShape k = - make_k(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_k(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); ParallelTensorShape v = - make_v(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_v(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_o(SumDegree{1_p}, 
DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); CHECK(result == correct); } @@ -279,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, 1_n); + make_w(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p, 1_p); CHECK(result == correct); } @@ -287,7 +287,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); + make_input_bias(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p); CHECK(result == correct); } @@ -295,25 +295,25 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); + make_output_bias(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p); CHECK(result == correct); } } SUBCASE("attention head parallelism") { - nonnegative_int o_h = 2_n; + positive_int o_h = 2_p; ParallelTensorShape q = - make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); + make_q(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p, 1_p, 1_p); ParallelTensorShape k = - make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); + make_k(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p, 1_p, 1_p); ParallelTensorShape v = - make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); + make_v(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -321,7 +321,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_h); + make_w(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_h); CHECK(result == correct); } @@ -329,7 +329,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); + make_input_bias(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p); CHECK(result == correct); } @@ -337,26 +337,26 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); + make_output_bias(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p); CHECK(result == correct); } } SUBCASE("combined data & attention head parallelism") { - nonnegative_int o_b = 4_n; - nonnegative_int o_h = 2_n; + positive_int o_b = 4_p; + positive_int o_h = 2_p; ParallelTensorShape q = - make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); + make_q(SumDegree{1_p}, DiscardCopyDegree{o_h}, o_b, 1_p, 1_p); ParallelTensorShape k = - make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); + make_k(SumDegree{1_p}, DiscardCopyDegree{o_h}, o_b, 1_p, 1_p); ParallelTensorShape v = - make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); + make_v(SumDegree{1_p}, DiscardCopyDegree{o_h}, o_b, 1_p, 1_p); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); CHECK(result == correct); } @@ -364,7 +364,7 @@ TEST_SUITE(FF_TEST_SUITE) { 
tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, o_h); + make_w(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p, o_h); CHECK(result == correct); } @@ -373,7 +373,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_input_bias_shape(attrs, q, k, v); tl::expected correct = make_input_bias( - SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); + SumDegree{1_p}, DiscardCopyDegree{o_b * o_h}, 1_p); CHECK(result == correct); } @@ -382,7 +382,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_output_bias_shape(attrs, q, k, v); tl::expected correct = make_output_bias( - SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); + SumDegree{1_p}, DiscardCopyDegree{o_b * o_h}, 1_p); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc index 27c59ee497..d251fb731d 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc @@ -6,10 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(BatchMatmulAttrs, TensorShape)") { - nonnegative_int b = 4_n; - nonnegative_int m = 6_n; - nonnegative_int n = 8_n; - nonnegative_int p = 10_n; + positive_int b = 4_p; + positive_int m = 6_p; + positive_int n = 8_p; + positive_int p = 10_p; BatchMatmulAttrs attrs = BatchMatmulAttrs{ /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_lhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, m, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("valid") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, m, p, @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct_output_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, p, @@ -60,8 +60,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched b") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ - b + 1_n, + FFOrdered{ + b + 1_p, m, p, }, @@ -78,9 +78,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched m") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, - m + 1_n, + m + 1_p, p, }, }, @@ -95,15 +95,15 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_output_shape(BatchMatmulAttrs, ParallelTensorShape)") { - nonnegative_int b = 2_n * 2_n; - nonnegative_int o_b = 2_n; - nonnegative_int m = 3_n * 3_n; - nonnegative_int o_m = 3_n; - nonnegative_int n = 5_n * 5_n; - nonnegative_int o_n = 5_n; - nonnegative_int p = 7_n * 7_n; - nonnegative_int o_p = 7_n; - nonnegative_int o_sum = 11_n; + positive_int b = 2_p * 2_p; + positive_int o_b = 2_p; + positive_int m = 3_p * 3_p; + positive_int o_m = 3_p; + positive_int n = 5_p * 5_p; + positive_int o_n = 5_p; + positive_int p = 7_p * 7_p; + positive_int o_p = 7_p; + positive_int o_sum = 11_p; BatchMatmulAttrs attrs = BatchMatmulAttrs{ /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are @@ -113,9 +113,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_b, - nonnegative_int o_n, - nonnegative_int o_m) { + positive_int o_b, + positive_int o_n, + positive_int o_m) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -134,9 +134,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_b, - nonnegative_int o_m, - nonnegative_int o_p) { + positive_int o_b, + 
positive_int o_m, + positive_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -155,9 +155,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_b, - nonnegative_int o_n, - nonnegative_int o_p) { + positive_int o_b, + positive_int o_n, + positive_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -177,10 +177,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("data parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); CHECK(result == correct); } @@ -188,10 +188,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_n}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{o_n}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p); CHECK(result == correct); } @@ -199,10 +199,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("p parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_p}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{o_p}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, o_p)); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, o_p); CHECK(result == correct); } @@ -210,10 +210,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_m), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_m, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, o_m), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_m, 1_p)); tl::expected correct = - make_output(SumDegree{o_m}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{o_m}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -221,10 +221,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction lhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -232,10 +232,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction rhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), - 
make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -243,10 +243,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p)); tl::expected correct = make_output( - SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -254,8 +254,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & rhs (invalid)") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)); CHECK_MESSAGE( !result.has_value(), "Unexpected successful value: ", result); @@ -264,11 +264,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p), make_rhs( - SumDegree{1_n}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); + SumDegree{1_p}, DiscardCopyDegree{o_sum * o_n}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p); CHECK(result == correct); } @@ -276,11 +276,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, 1_n), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, o_n, 1_p), make_rhs( - SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_p, 1_p, 1_p)); tl::expected correct = make_output( - SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p); CHECK(result == correct); } @@ -288,15 +288,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n & m") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, o_m), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, o_n, o_m), make_rhs( - SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, o_m, 1_n)); + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_p, o_m, 1_p)); tl::expected correct = make_output(SumDegree{o_sum * o_sum * o_m}, - DiscardCopyDegree{1_n}, - 1_n, + DiscardCopyDegree{1_p}, + 1_p, o_n, - 1_n); + 1_p); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc index cd9796945c..b70e8fcb4e 100644 --- 
a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc @@ -60,11 +60,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, - 18_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, + 18_p, }}, DataType::FLOAT, }; @@ -72,8 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 14_n, + TensorDims{FFOrdered{ + 14_p, }}, DataType::FLOAT, }; @@ -140,16 +140,16 @@ TEST_SUITE(FF_TEST_SUITE) { }(); SUBCASE("partition parallelism (in channel dim)") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, degree, - 1_n, - 1_n, + 1_p, + 1_p, }, }; @@ -169,9 +169,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{degree}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -194,9 +194,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{degree}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -214,12 +214,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (not in channel dim)") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, degree, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, degree, 1_p}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -251,12 +251,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ sum_degree, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -288,12 +288,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, + SumDegree{1_p}, discard_copy_degree, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 1_n}, - ShardParallelDim{14_n, 2_n}, - ShardParallelDim{16_n, 1_n}, - ShardParallelDim{18_n, 1_n}, + ShardParallelDim{12_p, 1_p}, + ShardParallelDim{14_p, 2_p}, + ShardParallelDim{16_p, 1_p}, + ShardParallelDim{18_p, 1_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, @@ -368,11 +368,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 2_n}, + 
ShardParallelDim{14_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, @@ -388,11 +388,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 2_n}, + ShardParallelDim{14_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/cast.cc b/lib/op-attrs/test/src/op-attrs/ops/cast.cc index e9ec890b4b..eeba779dfe 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/cast.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/cast.cc @@ -12,15 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { CastAttrs attrs = CastAttrs{output_datatype}; - nonnegative_int d1 = 12_n; - nonnegative_int d2 = 16_n; + positive_int d1 = 12_p; + positive_int d2 = 16_p; TensorShape input = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, input_datatype, }; TensorShape output = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, output_datatype, }; @@ -34,30 +34,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_output_shape(CastAttrs, ParallelTensorShape)") { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_features) { + positive_int o_batch, + positive_int o_features) { return lift_to_parallel_with_degrees( input, o_sum, o_eq, - FFOrdered{o_batch, o_features}); + FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_outchannels) { + positive_int o_batch, + positive_int o_outchannels) { return lift_to_parallel_with_degrees( output, o_sum, o_eq, - FFOrdered{o_batch, o_outchannels}); + FFOrdered{o_batch, o_outchannels}); }; - SumDegree sum_degree = SumDegree{2_n}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3_n}; - nonnegative_int batch_degree = 4_n; - nonnegative_int feature_degree = 8_n; + SumDegree sum_degree = SumDegree{2_p}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3_p}; + positive_int batch_degree = 4_p; + positive_int feature_degree = 8_p; ParallelTensorShape par_input = make_input( sum_degree, discard_copy_degree, batch_degree, feature_degree); diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index 14fbca5b3a..07520e7cce 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -10,14 +10,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 2_n}, - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 3_n}, - ShardParallelDim{18_n, 2_n}, + ShardParallelDim{12_p, 2_p}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 3_p}, + ShardParallelDim{18_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -25,7 +25,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("valid") { ff_dim_t dim = ff_dim_t{2_n}; - nonnegative_int degree = 3_n; + positive_int degree = 3_p; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -36,7 +36,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; - output.dims.shard_dims.at(dim).degree /= degree; + positive_int old_shard_degree = 
output.dims.shard_dims.at(dim).degree; + output.dims.shard_dims.at(dim).degree = positive_int{old_shard_degree / degree}; return output; }(); @@ -45,7 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("invalid") { ff_dim_t dim = ff_dim_t{2_n}; - nonnegative_int degree = 4_n; + positive_int degree = 4_p; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index b84cf38753..ee1255161c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -10,7 +10,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(ConcatAttrs, std::vector)") { ConcatAttrs attrs = ConcatAttrs{ - ff_dim_t{nonnegative_int{1}}, + ff_dim_t{1_n}, }; SUBCASE("empty input shapes list passed") { @@ -23,12 +23,12 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - nonnegative_int dim0_size = 12_n; - nonnegative_int dim2_size = 20_n; + positive_int dim0_size = 12_p; + positive_int dim2_size = 20_p; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14_n, + 14_p, dim2_size, }}, DataType::FLOAT, @@ -45,26 +45,26 @@ TEST_SUITE(FF_TEST_SUITE) { } TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16_n, + 16_p, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_p, dim2_size}}, DataType::FLOAT, }; SUBCASE("input shapes do not shared the same num_dims") { TensorShape mismatched_num_dims = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 20_n, + 20_p, dim2_size, - 1_n, + 1_p, }}, DataType::FLOAT, }; @@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("concat axis is out of bounds") { attrs = ConcatAttrs{ - ff_dim_t{nonnegative_int{3}}, + ff_dim_t{3_n}, }; std::vector input_shapes = { @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(attrs, input_shapes); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14_n + 16_n + 18_n, + 14_p + 16_p + 18_p, dim2_size, }}, DataType::FLOAT, @@ -115,100 +115,100 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(ConcatAttrs, std::vector)") { ConcatAttrs attrs = ConcatAttrs{ - ff_dim_t{nonnegative_int{1}}, + ff_dim_t{1_n}, }; - nonnegative_int dim0_size = 12_n; - nonnegative_int dim2_size = 20_n; + positive_int dim0_size = 12_p; + positive_int dim2_size = 20_p; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14_n, + 14_p, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16_n, + 16_p, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_p, dim2_size}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{ - dim0_size, 14_n + 16_n + 18_n, dim2_size}}, + TensorDims{FFOrdered{ + dim0_size, 14_p + 16_p + 18_p, dim2_size}}, DataType::FLOAT, }; auto lift_input1 = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - 
input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto lift_input2 = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto lift_input3 = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto lift_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); + output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; SUBCASE("sum reduction parallelism") { SUBCASE("matching") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; std::vector inputs = { - lift_input1(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input2(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input3(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input1(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input2(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input3(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + lift_output(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{2_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input2(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input3(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input1(SumDegree{2_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input2(SumDegree{4_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input3(SumDegree{4_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), }; std::optional result = @@ -221,27 +221,27 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("discard copy reduction parallelism") { SUBCASE("matching") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; std::vector inputs = { - lift_input1(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), - lift_input2(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), - lift_input3(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input1(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p), + lift_input2(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p), + lift_input3(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); + lift_output(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), - lift_input2(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), - 
lift_input3(SumDegree{1_n}, DiscardCopyDegree{4_n}, 1_n, 1_n, 1_n), + lift_input1(SumDegree{1_p}, DiscardCopyDegree{2_p}, 1_p, 1_p, 1_p), + lift_input2(SumDegree{1_p}, DiscardCopyDegree{2_p}, 1_p, 1_p, 1_p), + lift_input3(SumDegree{1_p}, DiscardCopyDegree{4_p}, 1_p, 1_p, 1_p), }; std::optional result = @@ -254,15 +254,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in axis dim") { SUBCASE("matching") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; std::vector inputs = { lift_input1( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p), lift_input2( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p), lift_input3( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p), }; std::optional result = @@ -274,9 +274,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 2_n, 1_n), + lift_input1(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input2(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input3(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 2_p, 1_p), }; std::optional result = @@ -289,31 +289,31 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in non-axis shard dims") { SUBCASE("matching") { - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 4_n; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; std::vector inputs = { lift_input1( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2), lift_input2( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2), lift_input3( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = lift_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 2_n, 1_n, 4_n), - lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), - lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), + lift_input1(SumDegree{1_p}, DiscardCopyDegree{1_p}, 2_p, 1_p, 4_p), + lift_input2(SumDegree{1_p}, DiscardCopyDegree{1_p}, 4_p, 1_p, 2_p), + lift_input3(SumDegree{1_p}, DiscardCopyDegree{1_p}, 4_p, 1_p, 2_p), }; std::optional result = @@ -325,21 +325,21 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("parallelism degrees are not mutually exclusive") { - SumDegree sum_degree = SumDegree{3_n}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5_n}; - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 4_n; + SumDegree sum_degree = SumDegree{3_p}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5_p}; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; std::vector inputs = { - lift_input1(sum_degree, discard_copy_degree, degree0, 1_n, degree2), - lift_input2(sum_degree, discard_copy_degree, degree0, 1_n, degree2), - 
lift_input3(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input1(sum_degree, discard_copy_degree, degree0, 1_p, degree2), + lift_input2(sum_degree, discard_copy_degree, degree0, 1_p, degree2), + lift_input3(sum_degree, discard_copy_degree, degree0, 1_p, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, discard_copy_degree, degree0, 1_n, degree2); + lift_output(sum_degree, discard_copy_degree, degree0, 1_p, degree2); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc index f5006d4352..67b6bbadb8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc @@ -7,14 +7,14 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_conv2d_incoming_tensor_roles(Conv2DAttrs") { auto make_attrs = [](bool use_bias) { - return Conv2DAttrs{/*out_channels=*/4_n, - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + return Conv2DAttrs{/*out_channels=*/4_p, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, - /*groups=*/1_n, + /*groups=*/1_p, /*activation=*/std::nullopt, /*use_bias=*/use_bias}; }; @@ -48,14 +48,14 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Conv2D shape inference") { - nonnegative_int out_channels = 4_n; - nonnegative_int kernel_h = 3_n; - nonnegative_int kernel_w = 2_n; - nonnegative_int stride_h = 2_n; - nonnegative_int stride_w = 2_n; + positive_int out_channels = 4_p; + positive_int kernel_h = 3_p; + positive_int kernel_w = 2_p; + positive_int stride_h = 2_p; + positive_int stride_w = 2_p; nonnegative_int padding_h = 1_n; nonnegative_int padding_w = 1_n; - nonnegative_int groups = 1_n; + positive_int groups = 1_p; std::optional activation = std::nullopt; bool use_bias = true; @@ -72,13 +72,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*use_bias=*/true, }; - nonnegative_int num_samples = 7_n; - nonnegative_int input_channels = 4_n; - nonnegative_int input_height = 11_n; - nonnegative_int input_width = 15_n; + positive_int num_samples = 7_p; + positive_int input_channels = 4_p; + positive_int input_height = 11_p; + positive_int input_width = 15_p; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, input_channels, input_height, @@ -87,11 +87,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - nonnegative_int output_height = 6_n; - nonnegative_int output_width = 8_n; + positive_int output_height = 6_p; + positive_int output_width = 8_p; TensorShape output = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, out_channels, output_height, @@ -101,7 +101,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape kernel = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ out_channels, input_channels, kernel_h, @@ -111,7 +111,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape bias = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ out_channels, }}, DataType::FLOAT, @@ -137,149 +137,149 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_n, - nonnegative_int o_c, - nonnegative_int o_h, - nonnegative_int o_w) { + positive_int o_n, + positive_int o_c, + positive_int o_h, + positive_int o_w) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto 
make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_n, - nonnegative_int o_c, - nonnegative_int o_h, - nonnegative_int o_w) { + positive_int o_n, + positive_int o_c, + positive_int o_h, + positive_int o_w) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_kernel = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_outchannels, - nonnegative_int o_inchannels, - nonnegative_int o_kernel_h, - nonnegative_int o_kernel_w) { + positive_int o_outchannels, + positive_int o_inchannels, + positive_int o_kernel_h, + positive_int o_kernel_w) { return lift_to_parallel_with_degrees( kernel, o_sum, o_eq, - FFOrdered{ + FFOrdered{ o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); }; auto make_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_outchannels) { + positive_int o_outchannels) { return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannels}); + bias, o_sum, o_eq, FFOrdered{o_outchannels}); }; SUBCASE("data parallelism") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n); + make_bias(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p); CHECK(result == correct); } } SUBCASE("input channel parallelism") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, 
DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree); + make_bias(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree); CHECK(result == correct); } } SUBCASE("propagating sum degree") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc index e1a03a7613..194a93387b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc @@ -15,10 +15,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ -36,10 +36,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ -48,42 +48,42 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; SUBCASE("partition parallelism (allowed)") { - nonnegative_int degree0 = 2_n; 
- nonnegative_int degree2 = 4_n; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); CHECK(result == correct); } SUBCASE("sum parallelism (not allowed)") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_input(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -93,10 +93,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (not allowed)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); + make_input(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc index d6a92036f0..4ef34c666e 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc @@ -7,9 +7,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("EWAdd shape inference") { - nonnegative_int d1 = 16_n; - nonnegative_int d2 = 32_n; - nonnegative_int d3 = 24_n; + positive_int d1 = 16_p; + positive_int d2 = 32_p; + positive_int d3 = 24_p; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched dim size") { TensorShape incorrect_rhs = input_lhs; - dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_n; + dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_p; tl::expected result = get_output_shape(attrs, input_lhs, incorrect_rhs); @@ -53,9 +53,9 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("EWAdd parallel shape inference") { - nonnegative_int d1 = 16_n; - nonnegative_int d2 = 32_n; - nonnegative_int d3 = 24_n; + positive_int d1 = 16_p; + positive_int d2 = 32_p; + positive_int d3 = 24_p; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -66,7 +66,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape unpar_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -83,68 +83,68 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - unpar_rhs, o_sum, o_eq, FFOrdered{o_1, 
o_2, o_3}); + unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("data parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p); CHECK(result == correct); } SUBCASE("reduction parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_lhs(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_rhs(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("invalid discard copy parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); + make_rhs(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); @@ -154,12 +154,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid mismatched parallelism degrees") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, degree); + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, degree); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc index bac6efba3f..355feb4c5f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc @@ -7,16 +7,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ReLU shape inference") { - nonnegative_int d1 = 16_n; - nonnegative_int d2 = 32_n; - nonnegative_int d3 = 24_n; + positive_int d1 = 16_p; + positive_int d2 = 32_p; + positive_int d3 = 24_p; ElementUnaryAttrs attrs = ElementUnaryAttrs{OperatorType::RELU, std::nullopt}; 
TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -33,18 +33,18 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("partition i.e., sharding parallelism") { - nonnegative_int degree1 = 4_n; - nonnegative_int degree2 = 8_n; + positive_int degree1 = 4_p; + positive_int degree2 = 8_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree1, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree1, 1_p, degree2); tl::expected result = get_output_shape(attrs, par_input); @@ -54,11 +54,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum degree > 1") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; tl::expected result = get_output_shape( attrs, - make_input(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); + make_input(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", @@ -66,11 +66,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy degree > 1") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; tl::expected result = get_output_shape( attrs, - make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n)); + make_input(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", diff --git a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc index 8fe50a4217..7d43b45dd0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc @@ -8,8 +8,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Sum embedding shape inference") { - nonnegative_int out_channels = 128_n; - nonnegative_int num_entries = 1024_n; + positive_int out_channels = 128_p; + positive_int num_entries = 1024_p; EmbeddingAttrs attrs = EmbeddingAttrs{ /*num_entries=*/num_entries, /*out_channels=*/out_channels, @@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { /*data_type=*/DataType::FLOAT, }; - nonnegative_int batch_size = 48_n; - nonnegative_int features_dim = 56_n; + positive_int batch_size = 48_p; + positive_int features_dim = 56_p; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, features_dim, }}, @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, out_channels, }, @@ -40,7 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ num_entries, out_channels, }, @@ -66,44 +66,44 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_features) { + positive_int o_batch, + positive_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, o_sum, o_eq, FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_outchannels) { + positive_int o_batch, + positive_int o_outchannels) { return lift_to_parallel_with_degrees( output, 
o_sum, o_eq, - FFOrdered{o_batch, o_outchannels}); + FFOrdered{o_batch, o_outchannels}); }; auto make_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_entries, - nonnegative_int o_outchannels) { + positive_int o_entries, + positive_int o_outchannels) { return lift_to_parallel_with_degrees( weights, o_sum, o_eq, - FFOrdered{o_entries, o_outchannels}); + FFOrdered{o_entries, o_outchannels}); }; SUBCASE("data parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); + make_input(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p); CHECK(result == correct); } @@ -111,21 +111,21 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, par_input); tl::expected correct = - make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); + make_weights(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p); CHECK(result == correct); } } SUBCASE("input features parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input = - make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); + make_input(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n); + make_output(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p); CHECK(result == correct); } @@ -133,7 +133,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); + make_weights(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p); CHECK(result == correct); } } @@ -145,15 +145,15 @@ TEST_SUITE(FF_TEST_SUITE) { // dimension. 
For now we choose to represent parallelism in the channel // dimension, but partitioning in the entry dimension is also potentially // useful as it produces sum parallelism in the output - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input = - make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); + make_input(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree); CHECK(result == correct); } @@ -161,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); + make_weights(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/flat.cc b/lib/op-attrs/test/src/op-attrs/ops/flat.cc index ebd869b3e5..c4fe8a5250 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/flat.cc @@ -9,11 +9,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(FlatAttrs, TensorShape)") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 2_n, - 4_n, - 2_n, - 3_n, + TensorDims{FFOrdered{ + 2_p, + 4_p, + 2_p, + 3_p, }}, DataType::FLOAT, }; @@ -26,8 +26,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n * 4_n * 2_n * 3_n, + TensorDims{FFOrdered{ + 2_p * 4_p * 2_p * 3_p, }}, DataType::FLOAT, }; @@ -37,16 +37,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten trailing dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{2}}, - /*end_dim=*/ff_dim_t{nonnegative_int{4}}, + /*start_dim=*/ff_dim_t{2_n}, + /*end_dim=*/ff_dim_t{4_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n, - 4_n, - 2_n * 3_n, + TensorDims{FFOrdered{ + 2_p, + 4_p, + 2_p * 3_p, }}, DataType::FLOAT, }; @@ -56,16 +56,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten leading dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{0}}, - /*end_dim=*/ff_dim_t{nonnegative_int{2}}, + /*start_dim=*/ff_dim_t{0_n}, + /*end_dim=*/ff_dim_t{2_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n * 4_n, - 2_n, - 3_n, + TensorDims{FFOrdered{ + 2_p * 4_p, + 2_p, + 3_p, }}, DataType::FLOAT, }; @@ -75,16 +75,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten middle dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{1}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3}}, + /*start_dim=*/ff_dim_t{1_n}, + /*end_dim=*/ff_dim_t{3_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n, - 4_n * 2_n, - 3_n, + TensorDims{FFOrdered{ + 2_p, + 4_p * 2_p, + 3_p, }}, DataType::FLOAT, }; @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten no dims (start_dim == end_dim)") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{2}}, - /*end_dim=*/ff_dim_t{nonnegative_int{2}}, + /*start_dim=*/ff_dim_t{2_n}, + /*end_dim=*/ff_dim_t{2_n}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -106,8 +106,8 @@ 
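The flat.cc expectations above all follow one rule: flattening collapses the half-open dimension range [start_dim, end_dim) into a single dimension whose size is the product of the collapsed extents, and an empty range leaves the shape untouched. A minimal standalone sketch of that rule, with a hypothetical flatten_dims helper (not part of the library), assuming exactly the semantics the tests assert:

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical mirror of the FlatAttrs semantics asserted above: dims in the
// half-open range [start_dim, end_dim) are multiplied into one dimension;
// dims outside the range pass through; an empty or inverted range is a no-op.
static std::vector<std::size_t> flatten_dims(std::vector<std::size_t> const &dims,
                                             std::size_t start_dim,
                                             std::size_t end_dim) {
  if (start_dim >= end_dim) {
    return dims;
  }
  std::vector<std::size_t> result(dims.begin(), dims.begin() + start_dim);
  std::size_t flattened = 1;
  for (std::size_t i = start_dim; i < end_dim; i++) {
    flattened *= dims.at(i);
  }
  result.push_back(flattened);
  result.insert(result.end(), dims.begin() + end_dim, dims.end());
  return result;
}

int main() {
  std::vector<std::size_t> input = {2, 4, 2, 3};
  // "flatten middle dims": [1, 3) collapses 4 and 2 into 8
  assert((flatten_dims(input, 1, 3) == std::vector<std::size_t>{2, 8, 3}));
  // "flatten trailing dims": [2, 4) collapses 2 and 3 into 6
  assert((flatten_dims(input, 2, 4) == std::vector<std::size_t>{2, 4, 6}));
  // "flatten no dims": an empty range leaves the shape unchanged
  assert(flatten_dims(input, 2, 2) == input);
}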
TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten no dims (start_dim < end_dim)") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{2}}, - /*end_dim=*/ff_dim_t{nonnegative_int{1}}, + /*start_dim=*/ff_dim_t{2_n}, + /*end_dim=*/ff_dim_t{1_n}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -119,23 +119,23 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE( "get_output_parallel_dim_degrees(FlatAttrs, ParallelTensorDimDegrees)") { - FlatAttrs attrs = FlatAttrs{/*start_dim=*/ff_dim_t{nonnegative_int{1}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3}}}; + FlatAttrs attrs = FlatAttrs{/*start_dim=*/ff_dim_t{1_n}, + /*end_dim=*/ff_dim_t{3_n}}; SUBCASE("allows shard parallelism in non-flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{2_n, 1_n, 1_n, 3_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{2_p, 1_p, 1_p, 3_p}, }; tl::expected result = get_output_parallel_dim_degrees(attrs, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{2_n, 1_n, 3_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{2_p, 1_p, 3_p}, }; CHECK(result == correct); @@ -143,9 +143,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not allow shard parallelism in flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 2_n, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 2_p, 1_p}, }; std::optional result = @@ -157,18 +157,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("allows sum parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 1_n}, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 1_p}, }; CHECK(result == correct); @@ -176,18 +176,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("allows discard copy parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, - FFOrdered{1_n, 1_n, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, + FFOrdered{1_p, 1_p, 1_p}, }; CHECK(result == correct); @@ -203,22 +203,22 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4_n, 2_n}, - ShardParallelDim{8_n, 1_n}, - ShardParallelDim{6_n, 1_n}, - ShardParallelDim{9_n, 3_n}, + ShardParallelDim{4_p, 2_p}, + ShardParallelDim{8_p, 1_p}, + ShardParallelDim{6_p, 1_p}, + ShardParallelDim{9_p, 3_p}, }, ReplicaParallelDimSet{ - SumDegree{7_n}, - DiscardCopyDegree{5_n}, + SumDegree{7_p}, + DiscardCopyDegree{5_p}, }, }, DataType::FLOAT, }; FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{1_n}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3_n}}, + /*start_dim=*/ff_dim_t{nonnegative_int{1_p}}, + 
/*end_dim=*/ff_dim_t{nonnegative_int{3_p}}, }; tl::expected result = @@ -227,13 +227,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4_n, 2_n}, - ShardParallelDim{8_n * 6_n, 1_n}, - ShardParallelDim{9_n, 3_n}, + ShardParallelDim{4_p, 2_p}, + ShardParallelDim{8_p * 6_p, 1_p}, + ShardParallelDim{9_p, 3_p}, }, ReplicaParallelDimSet{ - SumDegree{7_n}, - DiscardCopyDegree{5_n}, + SumDegree{7_p}, + DiscardCopyDegree{5_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc index b9aa3c0677..ba311ffb1a 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_layer_norm_incoming_tensor_roles(LayerNormAttrs)") { auto make_attrs = [](bool elementwise_affine) { return LayerNormAttrs{ - /*axes=*/{ff_dim_t{nonnegative_int{0}}, ff_dim_t{nonnegative_int{2}}}, + /*axes=*/{ff_dim_t{0_n}, ff_dim_t{2_n}}, elementwise_affine, /*eps=*/1.0, }; @@ -46,7 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("shape inference (LayerNorm)") { LayerNormAttrs attrs_affine_true = LayerNormAttrs{ - /*axes=*/{ff_dim_t{nonnegative_int{1}}, ff_dim_t{nonnegative_int{3}}}, + /*axes=*/{ff_dim_t{1_n}, ff_dim_t{3_n}}, /*elementwise_affine=*/true, /*eps=*/0.1, }; @@ -58,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, - 18_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, + 18_p, }}, DataType::FLOAT, }; @@ -70,9 +70,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 16_p, }}, DataType::FLOAT, }; @@ -125,58 +125,58 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2, - nonnegative_int o3) { + positive_int o0, + positive_int o1, + positive_int o2, + positive_int o3) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2, - nonnegative_int o3) { + positive_int o0, + positive_int o1, + positive_int o2, + positive_int o3) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; auto make_gamma_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o2) { + positive_int o0, + positive_int o2) { return lift_to_parallel_with_degrees( - gamma, o_sum, o_eq, FFOrdered{o0, o2}); + gamma, o_sum, o_eq, FFOrdered{o0, o2}); }; auto make_beta_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o2) { + positive_int o0, + positive_int o2) { return lift_to_parallel_with_degrees( - beta, o_sum, o_eq, FFOrdered{o0, o2}); + beta, o_sum, o_eq, FFOrdered{o0, o2}); }; SUBCASE("parallel shape inference (LayerNorm)") { SUBCASE("partition parallelism (not in axes)") { - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 3_n; + positive_int degree0 = 2_p; + positive_int degree2 = 3_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2, 1_n); + SumDegree{1_p}, 
DiscardCopyDegree{1_p}, degree0, 1_p, degree2, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { tl::expected result = get_output_shape(attrs_affine_true, par_input); tl::expected correct = - make_output(SumDegree{1_n}, - DiscardCopyDegree{1_n}, + make_output(SumDegree{1_p}, + DiscardCopyDegree{1_p}, degree0, - 1_n, + 1_p, degree2, - 1_n); + 1_p); CHECK(result == correct); } @@ -188,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_gamma_weights( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, degree2); CHECK(result == correct); } @@ -208,7 +208,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_beta_weights( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, degree2); CHECK(result == correct); } @@ -224,11 +224,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (in axes)") { - nonnegative_int degree1 = 2_n; - nonnegative_int degree2 = 4_n; + positive_int degree1 = 2_p; + positive_int degree2 = 4_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, degree2, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree1, degree2, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -257,10 +257,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + make_input(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -289,10 +289,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n, 1_n); + make_input(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc index eaa99ef099..1ca936738b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_linear_incoming_tensor_roles(LinearAttrs)") { auto make_attrs = [](bool use_bias) { return LinearAttrs{ - /*out_channels=*/16_n, + /*out_channels=*/16_p, /*use_bias=*/use_bias, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Linear shape inference") { - nonnegative_int out_channels = 16_n; + positive_int out_channels = 16_p; LinearAttrs attrs = LinearAttrs{ /*out_channels=*/out_channels, /*use_bias=*/true, @@ -56,13 +56,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*regularizer=*/std::nullopt, }; - nonnegative_int batch_size = 12_n; - nonnegative_int extra_dim = 16_n; - nonnegative_int in_channels = 8_n; + positive_int batch_size = 12_p; + positive_int extra_dim = 16_p; + positive_int in_channels = 8_p; 
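The input/output/projection/bias shapes built below encode three rules that the rest of this test exercises: the output keeps every input dimension except the innermost channel dimension, which becomes out_channels; the projection weight is {in_channels, out_channels}; and the bias is {out_channels}. A minimal standalone sketch of those rules (infer_linear_shapes is a hypothetical helper, not library code):

#include <cassert>
#include <vector>

// Hypothetical mirror of the Linear shape-inference rules asserted below.
struct LinearShapes {
  std::vector<int> output;
  std::vector<int> projection;
  std::vector<int> bias;
};

static LinearShapes infer_linear_shapes(std::vector<int> input, int out_channels) {
  int in_channels = input.back();
  std::vector<int> output = input;
  output.back() = out_channels; // only the innermost (channel) dim changes
  return LinearShapes{output, {in_channels, out_channels}, {out_channels}};
}

int main() {
  // batch_size = 12, extra_dim = 16, in_channels = 8, out_channels = 16
  LinearShapes s = infer_linear_shapes({12, 16, 8}, 16);
  assert((s.output == std::vector<int>{12, 16, 16}));
  assert((s.projection == std::vector<int>{8, 16}));
  assert((s.bias == std::vector<int>{16}));
}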
TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, in_channels, @@ -73,7 +73,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, out_channels, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape projection = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ in_channels, out_channels, }, @@ -94,7 +94,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape bias = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ out_channels, }, }, @@ -127,66 +127,66 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_extra_dim, - nonnegative_int o_channel) { + positive_int o_batch, + positive_int o_extra_dim, + positive_int o_channel) { return lift_to_parallel_with_degrees( input, o_sum, o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_extra_dim, - nonnegative_int o_channel) { + positive_int o_batch, + positive_int o_extra_dim, + positive_int o_channel) { return lift_to_parallel_with_degrees( output, o_sum, o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_projection = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_inchannel, - nonnegative_int o_outchannel) { + positive_int o_inchannel, + positive_int o_outchannel) { return lift_to_parallel_with_degrees( projection, o_sum, o_eq, - FFOrdered{o_inchannel, o_outchannel}); + FFOrdered{o_inchannel, o_outchannel}); }; auto make_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_outchannel) { + positive_int o_outchannel) { return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannel}); + bias, o_sum, o_eq, FFOrdered{o_outchannel}); }; SUBCASE("data parallelism") { - nonnegative_int input_sum_degree = 2_n; - nonnegative_int extra_dim_degree = 8_n; - nonnegative_int degree = 4_n; + positive_int input_sum_degree = 2_p; + positive_int extra_dim_degree = 8_p; + positive_int degree = 4_p; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, + DiscardCopyDegree{1_p}, degree, extra_dim_degree, - 1_n); + 1_p); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, + DiscardCopyDegree{1_p}, degree, extra_dim_degree, - 1_n); + 1_p); CHECK(result == correct); } @@ -195,10 +195,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_projection_shape(attrs, par_input); tl::expected correct = make_projection( - SumDegree{1_n}, + SumDegree{1_p}, DiscardCopyDegree{input_sum_degree * degree * extra_dim_degree}, - 1_n, - 1_n); + 1_p, + 1_p); CHECK(result == correct); } @@ -208,19 +208,19 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = make_bias(SumDegree{input_sum_degree}, DiscardCopyDegree{degree * extra_dim_degree}, - 1_n); + 1_p); CHECK(result == correct); } } SUBCASE("reduction parallelism") { - nonnegative_int input_sum_degree = 2_n; - nonnegative_int degree = 4_n; + positive_int input_sum_degree = 2_p; + positive_int degree = 4_p; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, - 1_n, - 1_n, + DiscardCopyDegree{1_p}, + 1_p, + 1_p, degree); { @@ -228,10 +228,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_output_shape(attrs, 
par_input); tl::expected correct = make_output(SumDegree{input_sum_degree * degree}, - DiscardCopyDegree{1_n}, - 1_n, - 1_n, - 1_n); + DiscardCopyDegree{1_p}, + 1_p, + 1_p, + 1_p); CHECK(result == correct); } @@ -239,10 +239,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection(SumDegree{1_n}, + make_projection(SumDegree{1_p}, DiscardCopyDegree{input_sum_degree}, degree, - 1_n); + 1_p); CHECK(result == correct); } @@ -250,29 +250,29 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); tl::expected correct = make_bias( - SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1_n}, 1_n); + SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1_p}, 1_p); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - nonnegative_int input_sum_degree = 2_n; - nonnegative_int degree = 4_n; + positive_int input_sum_degree = 2_p; + positive_int degree = 4_p; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, DiscardCopyDegree{degree}, - 1_n, - 1_n, - 1_n); + 1_p, + 1_p, + 1_p); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, - 1_n, - 1_n, + DiscardCopyDegree{1_p}, + 1_p, + 1_p, degree); CHECK(result == correct); } @@ -281,9 +281,9 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection(SumDegree{1_n}, + make_projection(SumDegree{1_p}, DiscardCopyDegree{input_sum_degree}, - 1_n, + 1_p, degree); CHECK(result == correct); } @@ -292,7 +292,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); tl::expected correct = make_bias( - SumDegree{input_sum_degree}, DiscardCopyDegree{1_n}, degree); + SumDegree{input_sum_degree}, DiscardCopyDegree{1_p}, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc index 6c14a226a2..9a27aafa5b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc @@ -9,25 +9,25 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("make_adaptive_pool2d") { - nonnegative_int input_n = 10_n; - nonnegative_int input_c = 11_n; - nonnegative_int input_h = 15_n; - nonnegative_int input_w = 20_n; + positive_int input_n = 10_p; + positive_int input_c = 11_p; + positive_int input_h = 15_p; + positive_int input_w = 20_p; Activation activation = Activation::RELU; PoolOp op = PoolOp::AVG; TensorDims input_dims = TensorDims{ - FFOrdered{input_n, input_c, input_h, input_w}}; + FFOrdered{input_n, input_c, input_h, input_w}}; SUBCASE("input_h divisible by output_h && input_w divisible by output_w") { - nonnegative_int output_h = 5_n; - nonnegative_int output_w = 2_n; + positive_int output_h = 5_p; + positive_int output_w = 2_p; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/10_n, - /*stride_h=*/3_n, - /*stride_w=*/10_n, + /*kernel_h=*/3_p, + /*kernel_w=*/10_p, + /*stride_h=*/3_p, + /*stride_w=*/10_p, /*padding_h=*/0_n, /*padding_w=*/0_n, /*pool_type=*/op, @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(correct_attrs, input_shape); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ input_n, input_c, output_h, @@ -64,8 +64,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h not divisible 
by output_h") { - nonnegative_int output_h = 6_n; - nonnegative_int output_w = 2_n; + positive_int output_h = 6_p; + positive_int output_w = 2_p; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -76,8 +76,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_w not divisible by output_w") { - nonnegative_int output_h = 5_n; - nonnegative_int output_w = 3_n; + positive_int output_h = 5_p; + positive_int output_w = 3_p; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -88,14 +88,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h == output_h and input_w == output_w") { - nonnegative_int output_h = input_h; - nonnegative_int output_w = input_w; + positive_int output_h = input_h; + positive_int output_w = input_w; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/1_n, - /*kernel_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_h=*/1_p, + /*kernel_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/0_n, /*pool_type=*/op, @@ -126,10 +126,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(Pool2DAttrs, TensorShape)") { Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, @@ -138,10 +138,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("fails on non-4d inputs") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, - 14_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, + 14_p, }}, DataType::FLOAT, }; @@ -155,14 +155,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("4d input") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{11_n, 13_n, 12_n, 6_n}}, + TensorDims{FFOrdered{11_p, 13_p, 12_p, 6_p}}, DataType::FLOAT, }; tl::expected result = get_output_shape(attrs, input); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{11_n, 13_n, 6_n, 4_n}}, + TensorDims{FFOrdered{11_p, 13_p, 6_p, 4_p}}, DataType::FLOAT, }; @@ -175,10 +175,10 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_attrs = [](PoolOp pool_type, std::optional const &activation) { return Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, /*pool_type=*/pool_type, @@ -190,13 +190,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 4_n, - 1_n, - 1_n, - 1_n, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 4_p, + 1_p, + 1_p, + 1_p, }, }; @@ -211,13 +211,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 4_n, - 2_n, - 5_n, - 6_n, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 4_p, + 2_p, + 5_p, + 6_p, }, }; @@ -232,13 +232,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{3_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{1_p}, + DiscardCopyDegree{3_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -256,13 +256,13 @@ TEST_SUITE(FF_TEST_SUITE) { 
make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -279,13 +279,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -302,13 +302,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/Activation::RELU); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -326,10 +326,10 @@ TEST_SUITE(FF_TEST_SUITE) { // just do a single test to make sure it works/exists Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 7_n}, - ShardParallelDim{16_n, 8_n}, - ShardParallelDim{12_n, 3_n}, - ShardParallelDim{6_n, 2_n}, + ShardParallelDim{14_p, 7_p}, + ShardParallelDim{16_p, 8_p}, + ShardParallelDim{12_p, 3_p}, + ShardParallelDim{6_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -359,14 +359,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 7_n}, - ShardParallelDim{16_n, 8_n}, - ShardParallelDim{6_n, 3_n}, - ShardParallelDim{4_n, 2_n}, + ShardParallelDim{14_p, 7_p}, + ShardParallelDim{16_p, 8_p}, + ShardParallelDim{6_p, 3_p}, + ShardParallelDim{4_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -377,14 +377,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 1_n}, - ShardParallelDim{12_n, 1_n}, - ShardParallelDim{6_n, 1_n}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 1_p}, + ShardParallelDim{12_p, 1_p}, + ShardParallelDim{6_p, 1_p}, }, ReplicaParallelDimSet{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc index dc12eb12a8..a480c840a3 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 2_n}, - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 3_n}, - ShardParallelDim{18_n, 2_n}, + ShardParallelDim{12_p, 2_p}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 3_p}, + ShardParallelDim{18_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + 
SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - nonnegative_int degree = 3_n; + positive_int degree = 3_p; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; @@ -34,7 +34,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; - output.dims.replica_dims.sum_degree.value /= degree; + positive_int old_sum_degree = output.dims.replica_dims.sum_degree.value; + output.dims.replica_dims.sum_degree.value = positive_int{old_sum_degree / degree}; return output; }(); @@ -42,7 +43,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc index 36a265ce9f..3743cebc31 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc @@ -7,7 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Repartition shape inference") { ff_dim_t dim = ff_dim_t{2_n}; - nonnegative_int degree = 4_n; + positive_int degree = 4_p; RepartitionAttrs attrs = RepartitionAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -16,14 +16,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 2_n}, - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 3_n}, - ShardParallelDim{18_n, 2_n}, + ShardParallelDim{12_p, 2_p}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 3_p}, + ShardParallelDim{18_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc index 770ae20d38..11ac7c02ab 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Replicate shape inference") { ReplicateAttrs attrs = ReplicateAttrs{ - /*replicate_degree=*/4_n, + /*replicate_degree=*/4_p, }; ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10_n, 2_n}, - ShardParallelDim{12_n, 1_n}, - ShardParallelDim{14_n, 2_n}, - ShardParallelDim{16_n, 2_n}, + ShardParallelDim{10_p, 2_p}, + ShardParallelDim{12_p, 1_p}, + ShardParallelDim{14_p, 2_p}, + ShardParallelDim{16_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape correct_output = input; correct_output.dims.replica_dims.discard_copy_degree = - DiscardCopyDegree{8_n}; + DiscardCopyDegree{8_p}; CHECK(result == correct_output); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc index 8c80e348c0..29507565e8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc @@ -10,10 +10,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, TensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ 
-41,10 +41,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, ParallelTensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ -52,28 +52,28 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; SUBCASE("partition parallelism in non-softmax-dim (valid)") { - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 4_n; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); SUBCASE("attrs.dim in bounds") { SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; @@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); CHECK(result == correct); } @@ -98,12 +98,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallism in softmax dim (invalid)") { - nonnegative_int degree1 = 2_n; + positive_int degree1 = 2_p; SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, 1_n); + make_input(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree1, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -113,12 +113,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism (invalid)") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_input(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -128,12 +128,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (invalid)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); + make_input(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc index 1187bfcfbf..f3d629cad8 100644 --- a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc +++ b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("PCGOperatorAttrs to/from json") { 
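The body below relies on nlohmann::json round-tripping of the attrs types. A standalone sketch of that serialize/deserialize pattern with a stand-in type (DemoAttrs is hypothetical; the real RepartitionAttrs/PCGOperatorAttrs serializers come from the dtgen-generated headers):

#include <cassert>
#include <nlohmann/json.hpp>

// Stand-in for a dtgen-style struct with json serialization support.
struct DemoAttrs {
  int repartition_dim;
  int repartition_degree;
  bool operator==(DemoAttrs const &o) const {
    return repartition_dim == o.repartition_dim &&
           repartition_degree == o.repartition_degree;
  }
};

NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(DemoAttrs, repartition_dim, repartition_degree)

int main() {
  DemoAttrs correct{/*repartition_dim=*/1, /*repartition_degree=*/4};
  nlohmann::json j = correct;       // serialize
  auto result = j.get<DemoAttrs>(); // deserialize
  assert(result == correct);        // round-trip preserves the value
}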
PCGOperatorAttrs correct = PCGOperatorAttrs{RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{1_n}, - /*repartition_degree=*/4_n, + /*repartition_degree=*/4_p, }}; nlohmann::json j = correct; auto result = j.get(); diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc index 7e072d82d9..044b50fae2 100644 --- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") { TensorDims goal = - TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; SUBCASE("dims match") { bool result = tensor_dims_is_broadcastable_to(goal, goal); @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs num_dims promotion") { - TensorDims curr = TensorDims{FFOrdered{4_n, 3_n}}; + TensorDims curr = TensorDims{FFOrdered{4_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("curr only needs dim expansion") { TensorDims curr = - TensorDims{FFOrdered{1_n, 1_n, 1_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 1_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs both num_dims promotion and dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1_n, 3_n}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("curr needs invalid dim promotion") { TensorDims curr = - TensorDims{FFOrdered{1_n, 1_n, 2_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 2_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -57,7 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("num_dims(goal) < num_dims(curr)") { TensorDims curr = - TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -67,13 +67,13 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_broadcast_target_dims(std::unordered_set)") { - TensorDims d1 = TensorDims{FFOrdered{1_n, 10_n, 4_n, 3_n}}; + TensorDims d1 = TensorDims{FFOrdered{1_p, 10_p, 4_p, 3_p}}; - TensorDims d2 = TensorDims{FFOrdered{10_n, 4_n, 1_n}}; + TensorDims d2 = TensorDims{FFOrdered{10_p, 4_p, 1_p}}; SUBCASE("has target in inputs") { TensorDims d3 = - TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("has no possible target") { TensorDims d3 = - TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n}}; + TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -95,10 +95,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("has possible target, but not in inputs") { TensorDims d3 = - TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p, 3_p}}; TensorDims possible_target = - TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; REQUIRE(tensor_dims_is_broadcastable_to(d1, possible_target)); REQUIRE(tensor_dims_is_broadcastable_to(d2, possible_target)); diff --git a/lib/pcg/include/pcg/computation_graph_builder.h 
b/lib/pcg/include/pcg/computation_graph_builder.h index b996026ce7..2be2a54cd8 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -84,15 +84,15 @@ struct ComputationGraphBuilder { // Add a 2D convolutional layer tensor_guid_t conv2d( tensor_guid_t const &input, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const &activation = std::nullopt, - nonnegative_int groups = 1_n, + positive_int groups = 1_p, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -106,8 +106,8 @@ struct ComputationGraphBuilder { // Add an embedding layer tensor_guid_t embedding( tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &initializer = std::nullopt, @@ -127,10 +127,10 @@ struct ComputationGraphBuilder { // Add a 2D pooling layer tensor_guid_t pool2d(tensor_guid_t const &input, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, PoolOp type = PoolOp::MAX, @@ -138,8 +138,8 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); tensor_guid_t adaptive_pool2d( tensor_guid_t const &input, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); @@ -164,7 +164,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); tensor_guid_t dense( tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -226,10 +226,10 @@ struct ComputationGraphBuilder { tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - nonnegative_int kdim = 0_n, - nonnegative_int vdim = 0_n, + positive_int embed_dim, + positive_int num_heads, + std::optional const &kdim = std::nullopt, + std::optional const &vdim = std::nullopt, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 11c5a81bba..863d9909c0 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ -8,11 +8,11 @@ namespace FlexFlow { -nonnegative_int get_num_gpus(MachineSpecification const &ms); -nonnegative_int get_num_cpus(MachineSpecification const &ms); -nonnegative_int get_num_devices(MachineSpecification const &ms, +positive_int get_num_gpus(MachineSpecification const &ms); +positive_int get_num_cpus(MachineSpecification const &ms); +positive_int get_num_devices(MachineSpecification const &ms, DeviceType const &device_type); -nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, +positive_int 
get_num_devices_per_node(MachineSpecification const &ms, DeviceType const &device_type); bool is_valid_machine_space_coordinate(MachineSpecification const &ms, diff --git a/lib/pcg/include/pcg/machine_specification.struct.toml b/lib/pcg/include/pcg/machine_specification.struct.toml index 7c624c7240..49e9bd9d78 100644 --- a/lib/pcg/include/pcg/machine_specification.struct.toml +++ b/lib/pcg/include/pcg/machine_specification.struct.toml @@ -10,20 +10,20 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_nodes" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_cpus_per_node" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_gpus_per_node" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "inter_node_bandwidth" diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index b095fad088..ceb0146f15 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -17,7 +17,7 @@ TaskSpaceCoordinate get_task_space_maximum_coordinate(OperatorTaskSpace const &task); nonnegative_int num_dims(OperatorTaskSpace const &task); -nonnegative_int num_tasks(OperatorTaskSpace const &task); +positive_int num_tasks(OperatorTaskSpace const &task); OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer); diff --git a/lib/pcg/include/pcg/operator_task_space.struct.toml b/lib/pcg/include/pcg/operator_task_space.struct.toml index 9cc4f6b93a..389e12e8f2 100644 --- a/lib/pcg/include/pcg/operator_task_space.struct.toml +++ b/lib/pcg/include/pcg/operator_task_space.struct.toml @@ -5,13 +5,13 @@ features = [ "ord", "hash", "json", - # "rapidcheck", + "rapidcheck", "fmt", ] includes = [ "", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -21,4 +21,4 @@ src_includes = [ [[fields]] name = "degrees" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index d4cace4a2a..aad2770101 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -32,15 +32,15 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t conv2d( parallel_tensor_guid_t const &input, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const &activation = std::nullopt, - nonnegative_int groups = 1_n, + positive_int groups = 1_p, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -49,7 +49,7 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t dense( parallel_tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = 
DataType::FLOAT, @@ -60,8 +60,8 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t embedding( parallel_tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -71,10 +71,10 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - std::optional kdim = std::nullopt, - std::optional vdim = std::nullopt, + positive_int embed_dim, + positive_int num_heads, + std::optional kdim = std::nullopt, + std::optional vdim = std::nullopt, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -119,20 +119,20 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t parallel_partition(parallel_tensor_guid_t const &input, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_combine(parallel_tensor_guid_t const &x, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_replicate(parallel_tensor_guid_t const &x, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_reduce(parallel_tensor_guid_t const &x, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); ParallelTensorShape get_shape(parallel_tensor_guid_t const &) const; diff --git a/lib/pcg/include/pcg/stride_t.struct.toml b/lib/pcg/include/pcg/stride_t.struct.toml index 8d950c5f39..3f07ec6b01 100644 --- a/lib/pcg/include/pcg/stride_t.struct.toml +++ b/lib/pcg/include/pcg/stride_t.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "unwrapped" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 267f05499c..0a24acc6aa 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -378,15 +378,15 @@ tensor_guid_t tensor_guid_t ComputationGraphBuilder::conv2d( tensor_guid_t const &x, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const &activation, - nonnegative_int groups, + positive_int groups, bool use_bias, std::optional const &maybe_kernel_initializer, std::optional const &maybe_bias_initializer, @@ -440,8 +440,8 @@ tensor_guid_t ComputationGraphBuilder::dropout( tensor_guid_t ComputationGraphBuilder::embedding( tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype, std::optional const &initializer, @@ -491,10 +491,10 @@ tensor_guid_t ComputationGraphBuilder::gather( } tensor_guid_t ComputationGraphBuilder::pool2d( tensor_guid_t const &x, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - 
nonnegative_int strideW, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, PoolOp type, @@ -525,8 +525,8 @@ tensor_guid_t ComputationGraphBuilder::pool2d( tensor_guid_t ComputationGraphBuilder::adaptive_pool2d( tensor_guid_t const &uncasted_input, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp type, std::optional const &activation, std::optional const &maybe_name) { @@ -591,10 +591,10 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - nonnegative_int kdim, - nonnegative_int vdim, + positive_int embed_dim, + positive_int num_heads, + std::optional const &kdim, + std::optional const &vdim, float dropout, bool bias, bool add_bias_kv, @@ -619,8 +619,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, /*num_heads=*/num_heads, - /*kdim=*/kdim, - /*vdim=*/vdim, + /*kdim=*/kdim.value_or(embed_dim), + /*vdim=*/vdim.value_or(embed_dim), /*dropout=*/dropout, /*bias=*/bias, /*add_bias_kv=*/add_bias_kv, @@ -667,7 +667,7 @@ TensorDims ComputationGraphBuilder::get_broadcast_target_dims( tensor_guid_t ComputationGraphBuilder::dense( tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation, bool use_bias, DataType data_type, diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc index 0fefeddd27..08afa415af 100644 --- a/lib/pcg/src/pcg/machine_specification.cc +++ b/lib/pcg/src/pcg/machine_specification.cc @@ -2,17 +2,18 @@ #include "pcg/device_id.h" #include "utils/containers/transform.h" #include "utils/exception.h" + namespace FlexFlow { -nonnegative_int get_num_gpus(MachineSpecification const &ms) { +positive_int get_num_gpus(MachineSpecification const &ms) { return ms.num_nodes * ms.num_gpus_per_node; } -nonnegative_int get_num_cpus(MachineSpecification const &ms) { +positive_int get_num_cpus(MachineSpecification const &ms) { return ms.num_nodes * ms.num_cpus_per_node; } -nonnegative_int get_num_devices(MachineSpecification const &ms, +positive_int get_num_devices(MachineSpecification const &ms, DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: @@ -24,7 +25,7 @@ nonnegative_int get_num_devices(MachineSpecification const &ms, } } -nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, +positive_int get_num_devices_per_node(MachineSpecification const &ms, DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index 88110f914a..3afa73ca62 100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -91,7 +91,7 @@ std::optional get_machine_space_coordinate( std::vector const &dimension_indices) { std::vector mv_strides = get_strides(machine_view); - std::vector sizes = + std::vector sizes = transform(dimension_indices, [&](nonnegative_int i) { return task.degrees.at(i.unwrap_nonnegative()) * mv_strides.at(i.unwrap_nonnegative()).unwrapped; @@ -100,13 +100,13 @@ std::optional get_machine_space_coordinate( transform(dimension_indices, [&](nonnegative_int i) { return coord.raw_coord.at(i.unwrap_nonnegative()); }); - std::vector strides = + 
std::vector strides = transform(dimension_indices, [&](nonnegative_int i) { return mv_strides.at(i.unwrap_nonnegative()).unwrapped; }); - std::vector coeffs = scanl( - sizes, nonnegative_int{1}, std::multiplies()); + std::vector coeffs = scanl( + sizes, 1_p, std::multiplies()); nonnegative_int index = start_idx; for (auto [coeff, coord_point, stride] : diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 57af6eedc7..36ad43f3d3 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -23,8 +23,8 @@ std::unordered_set get_task_space_coordinates(OperatorTaskSpace const &task) { std::vector> coordinate_ranges = - transform(task.degrees, [&](nonnegative_int num_points) { - return nonnegative_range(num_points); + transform(task.degrees, [&](positive_int num_points) { + return nonnegative_range(num_points.nonnegative_int_from_positive_int()); }); std::unordered_set> raw_coordinates = @@ -45,7 +45,7 @@ nonnegative_int num_dims(OperatorTaskSpace const &task) { return num_elements(task.degrees); } -nonnegative_int num_tasks(OperatorTaskSpace const &task) { +positive_int num_tasks(OperatorTaskSpace const &task) { return product(task.degrees); } @@ -54,7 +54,7 @@ OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0); ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor); - std::vector degrees; + std::vector degrees; extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); degrees.push_back(get_sum_degree(shape)); degrees.push_back(get_discard_copy_degree(shape)); diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc index 940024c9b6..e3caffe260 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc @@ -9,7 +9,7 @@ std::unordered_set ParallelTensorShape const &goal) { std::unordered_set result; - nonnegative_int sum_degree = get_sum_degree(goal); + positive_int sum_degree = get_sum_degree(goal); if (sum_degree != 1) { throw mk_runtime_error( fmt::format("generate_weight_transform currently only supports " @@ -17,7 +17,7 @@ std::unordered_set sum_degree)); } - nonnegative_int discard_copy_degree = get_discard_copy_degree(goal); + positive_int discard_copy_degree = get_discard_copy_degree(goal); if (discard_copy_degree != 1) { result.insert(ParallelOpAttrs{ReplicateAttrs{discard_copy_degree}}); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 4e72b2fe0f..f7f3cfdcfd 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -128,15 +128,15 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::cast( parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t const &raw_input, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const 
&activation, - nonnegative_int groups, + positive_int groups, bool use_bias, std::optional const &maybe_kernel_initializer, std::optional const &maybe_bias_initializer, @@ -176,7 +176,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation, bool use_bias, DataType data_type, @@ -209,8 +209,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t ParallelComputationGraphBuilder::embedding( parallel_tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype, std::optional const &maybe_kernel_initializer, @@ -238,10 +238,10 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention( parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - std::optional maybe_kdim, - std::optional maybe_vdim, + positive_int embed_dim, + positive_int num_heads, + std::optional maybe_kdim, + std::optional maybe_vdim, float dropout, bool bias, bool add_bias_kv, @@ -251,8 +251,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention( std::optional maybe_output_bias_initializer, std::optional const &maybe_name) { - nonnegative_int kdim = maybe_kdim.value_or(embed_dim); - nonnegative_int vdim = maybe_vdim.value_or(embed_dim); + positive_int kdim = maybe_kdim.value_or(embed_dim); + positive_int vdim = maybe_vdim.value_or(embed_dim); MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -409,7 +409,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::elu( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition( parallel_tensor_guid_t const &input, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { RepartitionAttrs attrs = RepartitionAttrs{ @@ -428,7 +428,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine( parallel_tensor_guid_t const &input, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { CombineAttrs attrs = CombineAttrs{ @@ -446,7 +446,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate( parallel_tensor_guid_t const &input, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { ReplicateAttrs attrs = ReplicateAttrs{degree}; @@ -461,7 +461,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_reduce( parallel_tensor_guid_t const &input, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { ReductionAttrs attrs = ReductionAttrs{degree}; diff --git a/lib/pcg/test/src/pcg/computation_graph.cc b/lib/pcg/test/src/pcg/computation_graph.cc index 341801d0b0..8451545e32 100644 --- a/lib/pcg/test/src/pcg/computation_graph.cc +++ b/lib/pcg/test/src/pcg/computation_graph.cc @@ -14,9 +14,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 
10_p, + 12_p, }}, DataType::FLOAT, }; @@ -41,9 +41,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -67,16 +67,16 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); b.dense(input, - /*outDim=*/14_n, + /*outDim=*/14_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -104,9 +104,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -132,9 +132,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -157,9 +157,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = make_empty_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -172,7 +172,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, diff --git a/lib/pcg/test/src/pcg/computation_graph_builder.cc b/lib/pcg/test/src/pcg/computation_graph_builder.cc index 98a4e2a241..f7430b3403 100644 --- a/lib/pcg/test/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/computation_graph_builder.cc @@ -8,20 +8,20 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ComputationGraphBuilder") { ComputationGraphBuilder b; - nonnegative_int batch_size = 2_n; + positive_int batch_size = 2_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, + TensorDims{FFOrdered{batch_size, 3_p, 10_p, 10_p}}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); tensor_guid_t output = b.conv2d(input, - /*outChannels=*/5_n, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*outChannels=*/5_p, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/0_n, /*paddingW=*/0_n); // ComputationGraph cg = b.computation_graph; diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc index 59c606adb1..7af3f648d9 100644 --- a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc @@ -10,15 +10,15 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 16_p, }}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - tensor_guid_t mm_output = b.dense(input, 8_n); + tensor_guid_t mm_output = b.dense(input, 8_p); tensor_guid_t relu_output = b.relu(mm_output); return b.computation_graph; diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc index 9d5dceca18..ec6a4ab006 100644 --- 
a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc @@ -11,9 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 16_n, + FFOrdered{ + 12_p, + 16_p, }, }, DataType::FLOAT, @@ -21,8 +21,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t t_partition = - b.parallel_partition(input, ff_dim_t{0_n}, 2_n); - parallel_tensor_guid_t mm_output = b.dense(input, 8_n); + b.parallel_partition(input, ff_dim_t{0_n}, 2_p); + parallel_tensor_guid_t mm_output = b.dense(input, 8_p); parallel_tensor_guid_t relu_output = b.relu(mm_output); return b.pcg; diff --git a/lib/pcg/test/src/pcg/machine_specification.cc b/lib/pcg/test/src/pcg/machine_specification.cc index 6d339350a0..4064f36679 100644 --- a/lib/pcg/test/src/pcg/machine_specification.cc +++ b/lib/pcg/test/src/pcg/machine_specification.cc @@ -8,9 +8,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineSpecification") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/4_n, - /*num_cpus_per_node=*/16_n, - /*num_gpus_per_node=*/8_n, + /*num_nodes=*/4_p, + /*num_cpus_per_node=*/16_p, + /*num_gpus_per_node=*/8_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc index e286f08bf2..ecc196a118 100644 --- a/lib/pcg/test/src/pcg/machine_view.cc +++ b/lib/pcg/test/src/pcg/machine_view.cc @@ -13,9 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}}}; SUBCASE("num_dims") { @@ -43,16 +43,16 @@ TEST_SUITE(FF_TEST_SUITE) { * Where the (x,) are the `TaskSpaceCoordinate`s, and the underlying grid * is the machine space. */ - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/1_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -112,18 +112,18 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, + MachineSpecification{/*num_nodes=*/3_p, + /*num_cpus_per_node=*/5_p, + /*num_gpus_per_node=*/5_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -179,18 +179,18 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. */ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/2_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -253,20 +253,20 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. */ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2_n, - /*num_cpus_per_node=*/8_n, - /*num_gpus_per_node=*/8_n, + MachineSpecification{/*num_nodes=*/2_p, + /*num_cpus_per_node=*/8_p, + /*num_gpus_per_node=*/8_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -319,17 +319,17 @@ TEST_SUITE(FF_TEST_SUITE) { * select */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/1_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { @@ -364,19 +364,19 @@ TEST_SUITE(FF_TEST_SUITE) { */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, + MachineSpecification{/*num_nodes=*/3_p, + /*num_cpus_per_node=*/5_p, + /*num_gpus_per_node=*/5_p, 
/*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { diff --git a/lib/pcg/test/src/pcg/operator_task_space.cc b/lib/pcg/test/src/pcg/operator_task_space.cc index fa06af3635..4b01ed02fb 100644 --- a/lib/pcg/test/src/pcg/operator_task_space.cc +++ b/lib/pcg/test/src/pcg/operator_task_space.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; std::unordered_set correct = {{ TaskSpaceCoordinate{{0_n, 0_n}}, @@ -32,7 +32,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{1_p, 2_p, 2_p}}; std::unordered_set correct = {{ TaskSpaceCoordinate{{0_n, 0_n, 0_n}}, @@ -48,7 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_task_space_maximum_coordinate") { SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p, 2_p}}; TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); @@ -56,7 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n, 4_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p, 2_p, 4_p}}; TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n, 3_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index d68e20bd92..f223558868 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -28,9 +28,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 16_n, + FFOrdered{ + 12_p, + 16_p, }, }, DataType::FLOAT, @@ -64,7 +64,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{10_n, 12_n}, + FFOrdered{10_p, 12_p}, }, DataType::FLOAT, }; @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::string my_op_name = "my op"; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -134,9 +134,9 @@ TEST_SUITE(FF_TEST_SUITE) { "get_source_layer(ParallelComputationGraph, parallel_tensor_guid_t)") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -205,9 +205,9 @@ TEST_SUITE(FF_TEST_SUITE) { "get_incoming_weights(ParallelComputationGraph, parallel_layer_guid_t)") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + 
FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -248,7 +248,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -261,7 +261,7 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionAttrs partition_input_attrs = RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }; ParallelLayerAddedResult partition_input_added = add_parallel_layer( @@ -281,7 +281,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_only(projection_weight_added.outputs); ReplicateAttrs replicate_projection_attrs = ReplicateAttrs{ - /*replicate_degree=*/2_n, + /*replicate_degree=*/2_p, }; ParallelLayerAddedResult replicate_projection_added = add_parallel_layer(pcg, @@ -309,9 +309,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("pcg_add_input_layer") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 10_n, + FFOrdered{ + 12_p, + 10_p, }, }, DataType::FLOAT, diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index b82cb009a9..1682ac6254 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -26,14 +26,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::add") { ParallelComputationGraphBuilder b; - ShardParallelDim d1 = ShardParallelDim{10_n, 2_n}; - ShardParallelDim d2 = ShardParallelDim{15_n, 3_n}; + ShardParallelDim d1 = ShardParallelDim{10_p, 2_p}; + ShardParallelDim d2 = ShardParallelDim{15_p, 3_p}; TensorShape lhs_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 15_n, + FFOrdered{ + 10_p, + 15_p, }, }, DataType::FLOAT, @@ -42,12 +42,12 @@ TEST_SUITE(FF_TEST_SUITE) { // ParallelTensorShape lhs_shape = ParallelTensorShape{ // ParallelTensorDims{ // FFOrdered{ - // ShardParallelDim{10_n, 2_n}, - // ShardParallelDim{15_n, 3_n}, + // ShardParallelDim{10_p, 2_p}, + // ShardParallelDim{15_p, 3_p}, // }, // ReplicaParallelDimSet{ - // SumDegree{2_n}, - // DiscardCopyDegree{1_n}, + // SumDegree{2_p}, + // DiscardCopyDegree{1_p}, // }, // }, // DataType::FLOAT, @@ -88,10 +88,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape a_shape = TensorShape{ TensorDims{ - FFOrdered{ - 4_n, - 10_n, - 15_n, + FFOrdered{ + 4_p, + 10_p, + 15_p, }, }, DataType::FLOAT, @@ -99,10 +99,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape b_shape = TensorShape{ TensorDims{ - FFOrdered{ - 4_n, - 15_n, - 10_n, + FFOrdered{ + 4_p, + 15_p, + 10_p, }, }, DataType::FLOAT, @@ -141,9 +141,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -176,24 +176,24 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::conv2d") { ParallelComputationGraphBuilder b; - nonnegative_int batch_size = 2_n; + positive_int batch_size = 2_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, + TensorDims{FFOrdered{batch_size, 3_p, 10_p, 10_p}}, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t par_input = - b.parallel_partition(input, ff_dim_t{0_n}, 2_n); + 
b.parallel_partition(input, ff_dim_t{0_n}, 2_p); ParallelTensorShape par_input_shape = b.get_shape(par_input); - nonnegative_int outChannels = 6_n; - nonnegative_int kernelH = 5_n; - nonnegative_int kernelW = 4_n; - nonnegative_int strideH = 3_n; - nonnegative_int strideW = 2_n; + positive_int outChannels = 6_p; + positive_int kernelH = 5_p; + positive_int kernelW = 4_p; + positive_int strideH = 3_p; + positive_int strideW = 2_p; nonnegative_int paddingH = 1_n; nonnegative_int paddingW = 0_n; parallel_tensor_guid_t output = b.conv2d(par_input, @@ -252,7 +252,7 @@ TEST_SUITE(FF_TEST_SUITE) { strideW, paddingH, paddingW, - /*groups=*/1_n, + /*groups=*/1_p, /*activation=*/std::nullopt, /*use_bias=*/true, }; @@ -298,14 +298,14 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 16_n, + FFOrdered{ + 10_p, + 16_p, }, }, DataType::FLOAT, }; - nonnegative_int outDim = 14_n; + positive_int outDim = 14_p; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.dense(input, @@ -336,9 +336,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 10_n, + FFOrdered{ + 12_p, + 10_p, }, }, DataType::INT32, @@ -346,8 +346,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.embedding(input, - /*num_entries=*/32_n, - /*outDim=*/8_n, + /*num_entries=*/32_p, + /*outDim=*/8_p, AggregateOp::SUM, DataType::FLOAT); parallel_layer_guid_t layer = get_source_layer(output); @@ -373,10 +373,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape query_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 16_n, - 10_n, + FFOrdered{ + 12_p, + 16_p, + 10_p, }, }, DataType::FLOAT, @@ -385,8 +385,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape key_shape = query_shape; TensorShape value_shape = query_shape; - nonnegative_int embed_dim = 8_n; - nonnegative_int num_heads = 6_n; + positive_int embed_dim = 8_p; + positive_int num_heads = 6_p; parallel_tensor_guid_t query = b.create_input_tensor(query_shape); parallel_tensor_guid_t key = b.create_input_tensor(key_shape); @@ -417,9 +417,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 32_n, + FFOrdered{ + 18_p, + 32_p, }, }, DataType::FLOAT, @@ -447,14 +447,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_partition") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; - ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; + ShardParallelDim batch_dim = ShardParallelDim{18_p, 2_p}; + ShardParallelDim feature_dim = ShardParallelDim{10_p, 1_p}; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, @@ -462,7 +462,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2_n); + b.parallel_partition(input, ff_dim_t{0_n}, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -485,18 +485,18 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - input = b.parallel_partition(input, ff_dim_t{0_n}, 2_n); + input = 
b.parallel_partition(input, ff_dim_t{0_n}, 2_p); parallel_tensor_guid_t output = - b.parallel_combine(input, ff_dim_t{0_n}, 2_n); + b.parallel_combine(input, ff_dim_t{0_n}, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -519,16 +519,16 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_replicate(input, 2_n); + parallel_tensor_guid_t output = b.parallel_replicate(input, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -551,21 +551,21 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - input = b.parallel_partition(input, ff_dim_t{1_n}, 2_n); + input = b.parallel_partition(input, ff_dim_t{1_n}, 2_p); input = b.dense(input, - /*out_dim=*/12_n, + /*out_dim=*/12_p, /*activation=*/std::nullopt, /*use_bias=*/false); - parallel_tensor_guid_t output = b.parallel_reduce(input, 2_n); + parallel_tensor_guid_t output = b.parallel_reduce(input, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { diff --git a/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc b/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc index 5a8f5fcd19..d037d64672 100644 --- a/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc +++ b/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc @@ -14,16 +14,16 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, }; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/8_n, + /*out_channels=*/8_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, diff --git a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc index 71c4d1b1d0..afd6ad6b33 100644 --- a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc +++ b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc @@ -8,9 +8,9 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - utility functions") { StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}}, DeviceType::GPU}; @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_strides") { std::vector result = get_strides(simv); - std::vector correct = {stride_t{2_n}, stride_t{2_n}}; + std::vector correct = {stride_t{2_p}, stride_t{2_p}}; CHECK(result == correct); } @@ -45,9 +45,9 @@ TEST_SUITE(FF_TEST_SUITE) { MachineSpaceCoordinate start = MachineSpaceCoordinate{1_n, 2_n, DeviceType::GPU}; std::vector dimensions = { - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{3_n}, + MachineViewDimension{stride_t{3_p}, MachineSpecificationDimension::INTRA_NODE}}; MachineView mv = MachineView{start, dimensions}; @@ -94,15 +94,15 @@ TEST_SUITE(FF_TEST_SUITE) { * | (0,) | | (1,) | | (2,) | | * 
+-------+-------+-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/1_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0.0, /*intra_node_bandwidth=*/0.0}; @@ -162,17 +162,17 @@ TEST_SUITE(FF_TEST_SUITE) { * +-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2_n, - /*num_cpus_per_node=*/4_n, - /*num_gpus_per_node=*/4_n, + MachineSpecification{/*num_nodes=*/2_p, + /*num_cpus_per_node=*/4_p, + /*num_gpus_per_node=*/4_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml index 3312b292a0..1994d54f38 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml @@ -21,6 +21,7 @@ includes = [ "op-attrs/datatype.dtg.h", "", "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -33,6 +34,9 @@ src_includes = [ [[values]] type = "::FlexFlow::nonnegative_int" +[[values]] +type = "::FlexFlow::positive_int" + [[values]] type = "bool" @@ -43,7 +47,7 @@ type = "float" type = "std::optional" [[values]] -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[values]] type = "std::vector<::FlexFlow::ff_dim_t>" diff --git a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc index 4f11b343f8..1568b73162 100644 --- a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc +++ b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc @@ -33,12 +33,12 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( switch (op_type) { case OperatorType::MULTIHEAD_ATTENTION: return PCGOperatorAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/acc.get( + /*embed_dim=*/acc.get( OperatorAttributeKey::EMBED_DIM), /*num_heads=*/ - acc.get(OperatorAttributeKey::NUM_HEADS), - /*kdim=*/acc.get(OperatorAttributeKey::KDIM), - /*vdim=*/acc.get(OperatorAttributeKey::VDIM), + acc.get(OperatorAttributeKey::NUM_HEADS), + /*kdim=*/acc.get(OperatorAttributeKey::KDIM), + /*vdim=*/acc.get(OperatorAttributeKey::VDIM), /*dropout=*/acc.get(OperatorAttributeKey::DROPOUT), /*bias=*/acc.get(OperatorAttributeKey::BIAS), 
/*add_bias_kv=*/acc.get(OperatorAttributeKey::ADD_BIAS_KV), @@ -46,10 +46,10 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( }}; case OperatorType::POOL2D: return PCGOperatorAttrs{Pool2DAttrs{ - /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), - /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), - /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), - /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), + /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), + /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), + /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), + /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), /*padding_h=*/ acc.get(OperatorAttributeKey::PADDING_H), /*padding_w=*/ @@ -66,7 +66,7 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( case OperatorType::DROPOUT: case OperatorType::LINEAR: return PCGOperatorAttrs{LinearAttrs{ - /*out_channels=*/acc.get( + /*out_channels=*/acc.get( OperatorAttributeKey::OUT_CHANNELS), /*use_bias=*/acc.get(OperatorAttributeKey::USE_BIAS), /*data_type=*/acc.get(OperatorAttributeKey::DATA_TYPE), diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc index ad78695fbb..05fd1a3fc9 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc @@ -62,15 +62,15 @@ TEST_SUITE(FF_TEST_SUITE) { Substitution sub = b.get_substitution(); - nonnegative_int in_channels = 24_n; - nonnegative_int batch_size = 4_n; - nonnegative_int batch_degree = 2_n; + positive_int in_channels = 24_p; + positive_int batch_size = 4_p; + positive_int batch_degree = 2_p; std::string mm_match = "mm_match"; std::string relu_match = "relu_match"; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, in_channels, }, @@ -84,11 +84,11 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(input_shape); t = b.parallel_partition(t, ff_dim_t{0_n}, batch_degree); t = b.dense(t, - /*outDim=*/16_n, + /*outDim=*/16_p, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12_n, + /*outDim=*/12_p, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -98,7 +98,7 @@ TEST_SUITE(FF_TEST_SUITE) { t = b.relu(t, /*name=*/relu_match); t = b.dense(t, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); @@ -138,11 +138,11 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(input_shape); t = b.parallel_partition(t, ff_dim_t{0_n}, batch_degree); t = b.dense(t, - /*outDim=*/16_n, + /*outDim=*/16_p, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12_n, + /*outDim=*/12_p, /*activation=*/Activation::RELU, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -150,7 +150,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*bias_initializer=*/std::nullopt, /*name=*/std::nullopt); t = b.dense(t, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc index 75bbbcae9e..7419c62965 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc +++ 
b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc @@ -111,15 +111,15 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - nonnegative_int in_channels = 24_n; - nonnegative_int batch_size = 4_n; - nonnegative_int batch_degree = 2_n; + positive_int in_channels = 24_p; + positive_int batch_size = 4_p; + positive_int batch_degree = 2_p; std::string mm_match = "mm_match"; std::string relu_match = "relu_match"; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, in_channels, }, @@ -133,11 +133,11 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(input_shape); t = b.parallel_partition(t, ff_dim_t{0_n}, batch_degree); t = b.dense(t, - /*outDim=*/16_n, + /*outDim=*/16_p, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12_n, + /*outDim=*/12_p, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -147,7 +147,7 @@ TEST_SUITE(FF_TEST_SUITE) { t = b.relu(t, /*name=*/relu_match); t = b.dense(t, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); @@ -189,7 +189,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_input_map = result.second.input_mapping; LinearAttrs correct_result_fused_mm_relu_attrs = LinearAttrs{ - /*out_channels=*/12_n, + /*out_channels=*/12_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc index 9b8e526c08..2bf72d3224 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -18,21 +18,21 @@ TEST_SUITE(FF_TEST_SUITE) { UnorderedSetLabelledOpenDataflowGraph>(); - nonnegative_int in_channels = 24_n; - nonnegative_int out_channels = 16_n; - nonnegative_int batch_size = 4_n; - nonnegative_int batch_degree = 2_n; + positive_int in_channels = 24_p; + positive_int out_channels = 16_p; + positive_int batch_size = 4_p; + positive_int batch_degree = 2_p; DataflowGraphInput i0 = g.add_input({}); ParallelTensorShape i0_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ ShardParallelDim{batch_size, batch_degree}, - ShardParallelDim{in_channels, 1_n}, + ShardParallelDim{in_channels, 1_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc index 24f9e9bd56..5dcfda0ca7 100644 --- a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc +++ b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attribute(LinearAttrs, OperatorAttributeKey)") { - nonnegative_int out_channels = 16_n; + positive_int out_channels = 16_p; bool use_bias = true; std::optional activation = Activation::GELU; std::optional regularizer = RegularizerAttrs{ diff --git a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index 4dbf0885dd..f4d430077f 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ 
b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -16,13 +16,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("simple case") { ParallelComputationGraphBuilder builder; - nonnegative_int batch_size = 16_n; - nonnegative_int batch_degree = 2_n; - nonnegative_int num_channels = 24_n; + positive_int batch_size = 16_p; + positive_int batch_degree = 2_p; + positive_int num_channels = 24_p; TensorShape a_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, num_channels, }, @@ -36,7 +36,7 @@ TEST_SUITE(FF_TEST_SUITE) { a_tensor = builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); - nonnegative_int outDim = 16_n; + positive_int outDim = 16_p; std::string x_matmul_name = "x_matmul"; std::string y_matmul_name = "y_matmul"; parallel_tensor_guid_t t0 = @@ -159,13 +159,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("pcg is a chain") { ParallelComputationGraphBuilder builder; - nonnegative_int batch_size = 16_n; - nonnegative_int batch_degree = 2_n; - nonnegative_int num_channels = 24_n; + positive_int batch_size = 16_p; + positive_int batch_degree = 2_p; + positive_int num_channels = 24_p; TensorShape a_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, num_channels, }, @@ -179,7 +179,7 @@ TEST_SUITE(FF_TEST_SUITE) { a_tensor = builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); - nonnegative_int outDim = 16_n; + positive_int outDim = 16_p; std::string x_matmul_name = "x_matmul"; std::string y_matmul_name = "y_matmul"; parallel_tensor_guid_t t0 = diff --git a/lib/substitutions/test/src/substitutions/unity_substitution_set.cc b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc index 804fa99bef..c86cb7e51f 100644 --- a/lib/substitutions/test/src/substitutions/unity_substitution_set.cc +++ b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc @@ -6,9 +6,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_substitution_set") { MachineSpecification machine_spec = MachineSpecification{ - /*num_nodes=*/2_n, - /*num_cpus_per_node=*/8_n, - /*num_gpus_per_node=*/4_n, + /*num_nodes=*/2_p, + /*num_cpus_per_node=*/8_p, + /*num_gpus_per_node=*/4_p, /*inter_node_bandwidth=*/0.0, /*intra_node_bandwidth=*/0.0, }; diff --git a/lib/task-spec/CMakeLists.txt b/lib/task-spec/CMakeLists.txt index 8deb20a593..8ccd8312cb 100644 --- a/lib/task-spec/CMakeLists.txt +++ b/lib/task-spec/CMakeLists.txt @@ -14,3 +14,5 @@ ff_add_library( pcg spdlog ) + +add_subdirectory(test) diff --git a/lib/task-spec/src/task-spec/ops/attention.cc b/lib/task-spec/src/task-spec/ops/attention.cc index 01960803ce..488517a02e 100644 --- a/lib/task-spec/src/task-spec/ops/attention.cc +++ b/lib/task-spec/src/task-spec/ops/attention.cc @@ -85,10 +85,10 @@ static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - nonnegative_int qProjSize = acc.get_argument(QPROJSIZE); - nonnegative_int kProjSize = acc.get_argument(KPROJSIZE); - nonnegative_int vProjSize = acc.get_argument(VPROJSIZE); - nonnegative_int oProjSize = acc.get_argument(OPROJSIZE); + positive_int qProjSize = acc.get_argument(QPROJSIZE); + positive_int kProjSize = acc.get_argument(KPROJSIZE); + positive_int vProjSize = acc.get_argument(VPROJSIZE); + positive_int oProjSize = acc.get_argument(OPROJSIZE); PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape query_parallel_tensor_shape = @@ -108,29 +108,29 @@ static DeviceSpecificDeviceStates 
          key_parallel_tensor_shape,
          value_parallel_tensor_shape));
 
-  nonnegative_int kvSeqLength = get_kvSeqLength(parsed);
-  nonnegative_int qSize = get_qSize(parsed);
-  nonnegative_int kSize = get_kSize(parsed);
-  nonnegative_int vSize = get_vSize(parsed);
+  positive_int kvSeqLength = get_kvSeqLength(parsed);
+  positive_int qSize = get_qSize(parsed);
+  positive_int kSize = get_kSize(parsed);
+  positive_int vSize = get_vSize(parsed);
 
-  nonnegative_int qoSeqLength = get_qoSeqLength(parsed);
-  nonnegative_int num_samples = get_num_samples(parsed);
-  nonnegative_int num_heads = attrs.num_heads;
+  positive_int qoSeqLength = get_qoSeqLength(parsed);
+  positive_int num_samples = get_num_samples(parsed);
+  positive_int num_heads = attrs.num_heads;
 
   MHAPerDeviceState per_device_state =
       init_kernel(handle,
                   allocator,
-                  num_samples.unwrap_nonnegative(),
-                  num_heads.unwrap_nonnegative(),
-                  qSize.unwrap_nonnegative(),
-                  kSize.unwrap_nonnegative(),
-                  vSize.unwrap_nonnegative(),
-                  qProjSize.unwrap_nonnegative(),
-                  kProjSize.unwrap_nonnegative(),
-                  vProjSize.unwrap_nonnegative(),
-                  oProjSize.unwrap_nonnegative(),
-                  qoSeqLength.unwrap_nonnegative(),
-                  kvSeqLength.unwrap_nonnegative(),
+                  num_samples.int_from_positive_int(),
+                  num_heads.int_from_positive_int(),
+                  qSize.int_from_positive_int(),
+                  kSize.int_from_positive_int(),
+                  vSize.int_from_positive_int(),
+                  qProjSize.int_from_positive_int(),
+                  kProjSize.int_from_positive_int(),
+                  vProjSize.int_from_positive_int(),
+                  oProjSize.int_from_positive_int(),
+                  qoSeqLength.int_from_positive_int(),
+                  kvSeqLength.int_from_positive_int(),
                   attrs.add_bias_kv);
   return DeviceSpecificDeviceStates{
       DeviceSpecific<MHAPerDeviceState>::create(per_device_state)};
@@ -185,7 +185,7 @@ static std::optional<float>
   assert(key_grad.shape == key.shape);
   assert(query_grad.shape == query.shape);
 
-  assert(weight_grad.shape.get_volume() == weight.shape.get_volume());
+  assert(weight_grad.shape.num_elements() == weight.shape.num_elements());
 
   return profile(backward_kernel,
                  profiling,
@@ -217,10 +217,10 @@ OpTaskSignature get_attention_init_signature() {
   init.add_arg_slot<ParallelTensorShape>(QUERY_PARALLEL_TENSOR_SHAPE);
   init.add_arg_slot<ParallelTensorShape>(KEY_PARALLEL_TENSOR_SHAPE);
   init.add_arg_slot<ParallelTensorShape>(VALUE_PARALLEL_TENSOR_SHAPE);
-  init.add_arg_slot<nonnegative_int>(QPROJSIZE);
-  init.add_arg_slot<nonnegative_int>(KPROJSIZE);
-  init.add_arg_slot<nonnegative_int>(VPROJSIZE);
-  init.add_arg_slot<nonnegative_int>(OPROJSIZE);
+  init.add_arg_slot<positive_int>(QPROJSIZE);
+  init.add_arg_slot<positive_int>(KPROJSIZE);
+  init.add_arg_slot<positive_int>(VPROJSIZE);
+  init.add_arg_slot<positive_int>(OPROJSIZE);
   init.add_arg_slot<MultiHeadAttentionAttrs>(ATTRS);
 
   init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
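The attention hunks above and the remaining task-spec ops below all lean on the same small positive_int surface: construction through the _p literal, multiplication that stays closed over positive values, and explicit narrowing via int_from_positive_int() at kernel boundaries. For orientation, a condensed sketch of that surface as it appears from the call sites in this patch; only the declarations are taken from the patch (the full header is added at the end of this diff), and the bodies here are illustrative guesses, not the real implementation:

#include <stdexcept>

// Illustrative sketch only; the real type lives in
// lib/utils/include/utils/positive_int/positive_int.h.
struct positive_int_sketch {
  positive_int_sketch() = delete;
  explicit positive_int_sketch(int value) : value_(value) {
    // The invariant the type exists to enforce: strictly positive.
    if (value <= 0) {
      throw std::invalid_argument("positive_int requires a value > 0");
    }
  }

  // Explicit narrowing used at kernel boundaries, e.g.
  // num_samples.int_from_positive_int().
  int int_from_positive_int() const {
    return this->value_;
  }

  // positive * positive stays positive, so batch *= dim_size is total.
  positive_int_sketch &operator*=(positive_int_sketch other) {
    this->value_ *= other.value_;
    return *this;
  }

  friend bool operator==(positive_int_sketch lhs, int rhs) {
    return lhs.value_ == rhs;
  }
  friend bool operator!=(positive_int_sketch lhs, int rhs) {
    return !(lhs == rhs);
  }

private:
  int value_;
};

// Literal used throughout the patch as 1_p, 2_p, ...
positive_int_sketch operator""_p(unsigned long long value) {
  return positive_int_sketch{static_cast<int>(value)};
}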
diff --git a/lib/task-spec/src/task-spec/ops/batch_matmul.cc b/lib/task-spec/src/task-spec/ops/batch_matmul.cc
index 371c80d7e2..1ee9da82d3 100644
--- a/lib/task-spec/src/task-spec/ops/batch_matmul.cc
+++ b/lib/task-spec/src/task-spec/ops/batch_matmul.cc
@@ -66,21 +66,21 @@ static std::optional<float>
     forward_task_impl(TaskArgumentAccessor const &acc) {
   FFIterationConfig iter_config =
       acc.get_argument<FFIterationConfig>(ITERATION_CONFIG);
-  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
-  assert(m == output.shape.at(legion_dim_t{0_n}));
-  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
-  assert(n == output.shape.at(legion_dim_t{1_n}));
-  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
-  assert(k == b_input.shape.at(legion_dim_t{1_n}));
-
-  assert(a_input.shape.get_volume() == b_input.shape.get_volume());
-  assert(a_input.shape.get_volume() == output.shape.get_volume());
-
-  nonnegative_int batch = 1_n;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
-    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
-    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
-    assert(dim_size == output.shape.at(legion_dim_t{i}));
+  positive_int m = b_input.shape.at(legion_dim_t{0_n});
+  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
+  positive_int n = a_input.shape.at(legion_dim_t{1_n});
+  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
+  positive_int k = a_input.shape.at(legion_dim_t{0_n});
+  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
+
+  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
+  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+
+  positive_int batch = 1_p;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) {
+    positive_int dim_size = a_input.shape.at(legion_dim_t{i});
+    ASSERT(dim_size == b_input.shape.at(legion_dim_t{i}));
+    ASSERT(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }
 
@@ -97,10 +97,10 @@ static std::optional<float>
     forward_task_impl(TaskArgumentAccessor const &acc) {
                  output.get_float_ptr(),
                  a_input.get_float_ptr(),
                  b_input.get_float_ptr(),
-                 m.unwrap_nonnegative(),
-                 n.unwrap_nonnegative(),
-                 k.unwrap_nonnegative(),
-                 batch.unwrap_nonnegative(),
+                 m.int_from_positive_int(),
+                 n.int_from_positive_int(),
+                 k.int_from_positive_int(),
+                 batch.int_from_positive_int(),
                  get_raw_seq_len(attrs.a_seq_length_dim),
                  get_raw_seq_len(attrs.b_seq_length_dim),
                  iter_config.seq_length);
@@ -116,31 +116,31 @@ static std::optional<float>
   auto output = acc.get_tensor(OUTPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
-  assert(output.shape == output_grad.shape);
+  ASSERT(output.shape == output_grad.shape);
 
   auto a_input = acc.get_tensor(A_INPUT);
   auto a_input_grad = acc.get_tensor_grad(A_INPUT);
-  assert(a_input.shape == a_input_grad.shape);
+  ASSERT(a_input.shape == a_input_grad.shape);
 
   auto b_input = acc.get_tensor(B_INPUT);
   auto b_input_grad = acc.get_tensor_grad(B_INPUT);
-  assert(b_input.shape == b_input_grad.shape);
+  ASSERT(b_input.shape == b_input_grad.shape);
 
   // check dims
-  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
-  assert(m == output.shape.at(legion_dim_t{0_n}));
-  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
-  assert(n == output.shape.at(legion_dim_t{1_n}));
-  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
-  assert(k == b_input.shape.at(legion_dim_t{1_n}));
-  assert(a_input.shape.get_volume() == b_input.shape.get_volume());
-  assert(a_input.shape.get_volume() == output.shape.get_volume());
-
-  nonnegative_int batch = 1_n;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
-    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
-    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
-    assert(dim_size == output.shape.at(legion_dim_t{i}));
+  positive_int m = b_input.shape.at(legion_dim_t{0_n});
+  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
+  positive_int n = a_input.shape.at(legion_dim_t{1_n});
+  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
+  positive_int k = a_input.shape.at(legion_dim_t{0_n});
+  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
+  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
+  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+
+  positive_int batch = 1_p;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) {
+    positive_int dim_size = a_input.shape.at(legion_dim_t{i});
+    ASSERT(dim_size == b_input.shape.at(legion_dim_t{i}));
+    ASSERT(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }
 
@@
-154,10 +154,10 @@ static std::optional a_input_grad.get_float_ptr(), b_input.get_float_ptr(), b_input_grad.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative()); + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int()); } TaskImplFunction get_batch_matmul_fwd_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/batch_norm.cc b/lib/task-spec/src/task-spec/ops/batch_norm.cc index 2aa308dada..67c5a7d8a2 100644 --- a/lib/task-spec/src/task-spec/ops/batch_norm.cc +++ b/lib/task-spec/src/task-spec/ops/batch_norm.cc @@ -75,10 +75,10 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); + positive_int output_w = output.shape.at(legion_dim_t{0_n}); + positive_int output_h = output.shape.at(legion_dim_t{1_n}); + positive_int output_c = output.shape.at(legion_dim_t{2_n}); + positive_int output_n = output.shape.at(legion_dim_t{3_n}); float *runningMean; @@ -86,10 +86,10 @@ static DeviceSpecificDeviceStates init_kernel(handle, allocator, runningMean, - output_n.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_w.unwrap_nonnegative(), + output_n.int_from_positive_int(), + output_c.int_from_positive_int(), + output_h.int_from_positive_int(), + output_w.int_from_positive_int(), attrs.relu); return DeviceSpecificDeviceStates{ @@ -141,7 +141,7 @@ static std::optional scale.get_float_ptr(), scale_grad.get_float_ptr(), bias_grad.get_float_ptr(), - output.shape.get_volume().unwrap_nonnegative()); + output.shape.num_elements().int_from_positive_int()); } TaskImplFunction get_batch_norm_init_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/conv_2d.cc b/lib/task-spec/src/task-spec/ops/conv_2d.cc index 47b889c6ce..ea4f7f79df 100644 --- a/lib/task-spec/src/task-spec/ops/conv_2d.cc +++ b/lib/task-spec/src/task-spec/ops/conv_2d.cc @@ -63,13 +63,13 @@ static DeviceSpecificDeviceStates Conv2DPerDeviceState per_device_state = init_kernel(/*handle=*/handle, /*activation=*/attrs.activation, - /*kernel_h=*/attrs.kernel_h.unwrap_nonnegative(), - /*kernel_w=*/attrs.kernel_w.unwrap_nonnegative(), - /*groups=*/attrs.groups.unwrap_nonnegative(), + /*kernel_h=*/attrs.kernel_h.int_from_positive_int(), + /*kernel_w=*/attrs.kernel_w.int_from_positive_int(), + /*groups=*/attrs.groups.int_from_positive_int(), /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), - /*stride_h=*/attrs.stride_h.unwrap_nonnegative(), - /*stride_w=*/attrs.stride_w.unwrap_nonnegative(), + /*stride_h=*/attrs.stride_h.int_from_positive_int(), + /*stride_w=*/attrs.stride_w.int_from_positive_int(), /*input=*/input, /*output=*/output, /*filter_ptr=*/filter.get_float_ptr(), diff --git a/lib/task-spec/src/task-spec/ops/gather.cc b/lib/task-spec/src/task-spec/ops/gather.cc index a0bfaddc0f..5f7173a991 100644 --- a/lib/task-spec/src/task-spec/ops/gather.cc +++ b/lib/task-spec/src/task-spec/ops/gather.cc @@ -68,10 +68,10 @@ static DeviceSpecificDeviceStates legion_dim_t legion_dim = legion_dim_from_ff_dim(attrs.dim, input.shape.num_dims()); - assert(input.shape.get_dim() == index.shape.get_dim()); - 
assert(output.shape.get_dim() == index.shape.get_dim()); + assert(input.shape.num_dims() == index.shape.num_dims()); + assert(output.shape.num_dims() == index.shape.num_dims()); - for (nonnegative_int i : nonnegative_range(input.shape.get_dim())) { + for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); if (i != legion_dim.value) { assert(input.shape.at(legion_dim_t{i}) == diff --git a/lib/task-spec/src/task-spec/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc index c2f16d7eda..7e6c5062e2 100644 --- a/lib/task-spec/src/task-spec/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -118,25 +118,25 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto handle = acc.get_argument(HANDLE); - nonnegative_int M = 1_n; + positive_int M = 1_p; for (int i = 0; i < attrs.axes.size(); i++) { legion_dim_t legion_dim = legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims()); M *= input.shape.at(legion_dim); } - nonnegative_int num_replicas = 1_n; + positive_int num_replicas = 1_p; for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { num_replicas *= input.shape.at(legion_dim_t{i}); } - nonnegative_int effective_num_elements = M; - nonnegative_int effective_batch_size = input.shape.get_volume() / M; + positive_int effective_num_elements = M; + positive_int effective_batch_size = positive_int{input.shape.num_elements() / M}; LayerNormPerDeviceState per_device_state = init_kernel(handle, allocator, attrs.elementwise_affine, - effective_batch_size.unwrap_nonnegative(), - effective_num_elements.unwrap_nonnegative(), + effective_batch_size.int_from_positive_int(), + effective_num_elements.int_from_positive_int(), attrs.eps); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index 8d4a81c5c4..3bf8080877 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -65,8 +65,8 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.at(ff_dim_t{1_n}); + positive_int out_dim = output.shape.at(ff_dim_t{0_n}); + positive_int batch_size = output.shape.at(ff_dim_t{1_n}); float *one_ptr; @@ -79,8 +79,8 @@ static DeviceSpecificDeviceStates input.data_type, weight.data_type, output.data_type, - batch_size.unwrap_nonnegative(), - attrs.out_channels.unwrap_nonnegative()); + batch_size.int_from_positive_int(), + attrs.out_channels.int_from_positive_int()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } @@ -95,9 +95,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; + positive_int in_dim = input.shape.at(ff_dim_t{0_n}); + positive_int out_dim = output.shape.at(ff_dim_t{0_n}); + positive_int batch_size = positive_int{output.shape.num_elements() / out_dim}; float const *bias_ptr = NULL; if (attrs.use_bias) { @@ -113,9 +113,9 @@ static std::optional 
forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr(), weight.get_float_ptr(), bias_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); + in_dim.int_from_positive_int(), + out_dim.int_from_positive_int(), + batch_size.int_from_positive_int()); } static std::optional @@ -139,9 +139,9 @@ static std::optional bias_grad_ptr = bias_grad.get_float_ptr(); } - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; + positive_int in_dim = input.shape.at(ff_dim_t{0_n}); + positive_int out_dim = output.shape.at(ff_dim_t{0_n}); + positive_int batch_size = positive_int{output.shape.num_elements() / out_dim}; return profile(backward_kernel, profiling, @@ -154,9 +154,9 @@ static std::optional weight.get_float_ptr(), weight_grad.get_float_ptr(), bias_grad_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); + in_dim.int_from_positive_int(), + out_dim.int_from_positive_int(), + batch_size.int_from_positive_int()); } TaskImplFunction get_linear_init_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/pool_2d.cc b/lib/task-spec/src/task-spec/ops/pool_2d.cc index d7064ca04d..bceced61d3 100644 --- a/lib/task-spec/src/task-spec/ops/pool_2d.cc +++ b/lib/task-spec/src/task-spec/ops/pool_2d.cc @@ -42,32 +42,32 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - nonnegative_int input_w = input.shape.at(ff_dim_t{0_n}); - nonnegative_int input_h = input.shape.at(ff_dim_t{1_n}); - nonnegative_int input_c = input.shape.at(ff_dim_t{2_n}); - nonnegative_int input_n = input.shape.at(ff_dim_t{3_n}); - nonnegative_int output_w = output.shape.at(ff_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(ff_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(ff_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(ff_dim_t{3_n}); + positive_int input_w = input.shape.at(ff_dim_t{0_n}); + positive_int input_h = input.shape.at(ff_dim_t{1_n}); + positive_int input_c = input.shape.at(ff_dim_t{2_n}); + positive_int input_n = input.shape.at(ff_dim_t{3_n}); + positive_int output_w = output.shape.at(ff_dim_t{0_n}); + positive_int output_h = output.shape.at(ff_dim_t{1_n}); + positive_int output_c = output.shape.at(ff_dim_t{2_n}); + positive_int output_n = output.shape.at(ff_dim_t{3_n}); Pool2DPerDeviceState per_device_state = init_kernel(handle, attrs.activation, - input_w.unwrap_nonnegative(), - input_h.unwrap_nonnegative(), - input_c.unwrap_nonnegative(), - input_n.unwrap_nonnegative(), - output_w.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_n.unwrap_nonnegative(), + input_w.int_from_positive_int(), + input_h.int_from_positive_int(), + input_c.int_from_positive_int(), + input_n.int_from_positive_int(), + output_w.int_from_positive_int(), + output_h.int_from_positive_int(), + output_c.int_from_positive_int(), + output_n.int_from_positive_int(), attrs.padding_h.unwrap_nonnegative(), attrs.padding_w.unwrap_nonnegative(), - attrs.kernel_h.unwrap_nonnegative(), - attrs.kernel_w.unwrap_nonnegative(), - attrs.stride_h.unwrap_nonnegative(), - attrs.stride_w.unwrap_nonnegative(), + attrs.kernel_h.int_from_positive_int(), + attrs.kernel_w.int_from_positive_int(), + attrs.stride_h.int_from_positive_int(), + attrs.stride_w.int_from_positive_int(), attrs.pool_type); 
return DeviceSpecificDeviceStates{ diff --git a/lib/task-spec/src/task-spec/ops/reduce.cc b/lib/task-spec/src/task-spec/ops/reduce.cc index ccc1285aaa..3efac36c3f 100644 --- a/lib/task-spec/src/task-spec/ops/reduce.cc +++ b/lib/task-spec/src/task-spec/ops/reduce.cc @@ -40,7 +40,7 @@ static DeviceSpecificDeviceStates OperatorType op_type = attrs.op_type; nonnegative_int reduction_size = - input.shape.get_volume() / output.shape.get_volume(); + input.shape.num_elements() / output.shape.num_elements(); ReducePerDeviceState per_device_state = init_kernel(handle, op_type, diff --git a/lib/task-spec/src/task-spec/ops/reduction.cc b/lib/task-spec/src/task-spec/ops/reduction.cc index 96e2c6c506..48f4c0e98d 100644 --- a/lib/task-spec/src/task-spec/ops/reduction.cc +++ b/lib/task-spec/src/task-spec/ops/reduction.cc @@ -49,14 +49,14 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - nonnegative_int num_replicas = attrs.reduction_degree; + positive_int num_replicas = attrs.reduction_degree; return profile(forward_kernel, profiling_settings, "[Reduction] forward_time = {:.2lf}ms\n", input, output, - num_replicas.unwrap_nonnegative()); + num_replicas.int_from_positive_int()); } static std::optional diff --git a/lib/task-spec/src/task-spec/ops/replicate.cc b/lib/task-spec/src/task-spec/ops/replicate.cc index 0ed5d98708..e91414bc16 100644 --- a/lib/task-spec/src/task-spec/ops/replicate.cc +++ b/lib/task-spec/src/task-spec/ops/replicate.cc @@ -68,7 +68,7 @@ static std::optional "[replicate] backward_time = {:.2lf}ms\n", output_grad, input_grad, - attrs.replicate_degree.unwrap_nonnegative()); + attrs.replicate_degree.int_from_positive_int()); } TaskImplFunction get_replicate_fwd_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/softmax.cc b/lib/task-spec/src/task-spec/ops/softmax.cc index d7b27fd884..81239d1a67 100644 --- a/lib/task-spec/src/task-spec/ops/softmax.cc +++ b/lib/task-spec/src/task-spec/ops/softmax.cc @@ -58,18 +58,18 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); + positive_int output_w = output.shape.at(legion_dim_t{0_n}); + positive_int output_h = output.shape.at(legion_dim_t{1_n}); + positive_int output_c = output.shape.at(legion_dim_t{2_n}); + positive_int output_n = output.shape.at(legion_dim_t{3_n}); SoftmaxPerDeviceState per_device_state = init_kernel(handle, attrs.dim.value.unwrap_nonnegative(), - output_n.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_w.unwrap_nonnegative()); + output_n.int_from_positive_int(), + output_c.int_from_positive_int(), + output_h.int_from_positive_int(), + output_w.int_from_positive_int()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -108,7 +108,7 @@ static std::optional "[SoftMax] backward_time = {:.2lf}ms\n", output_grad.get_float_ptr(), input_grad.get_float_ptr(), - output_grad.shape.get_volume().unwrap_nonnegative()); + output_grad.shape.num_elements().int_from_positive_int()); } TaskImplFunction get_softmax_init_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc index 
a14f6a587d..aa3184c999 100644 --- a/lib/task-spec/src/task-spec/ops/split.cc +++ b/lib/task-spec/src/task-spec/ops/split.cc @@ -44,11 +44,11 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {task_id_t::SPLIT_BWD_TASK_ID, binding}; } -static std::pair +static std::pair calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) { - nonnegative_int num_blocks = 1_n; - nonnegative_int block_size = 1_n; - for (nonnegative_int d : nonnegative_range(array_shape.num_elements())) { + positive_int num_blocks = 1_p; + positive_int block_size = 1_p; + for (nonnegative_int d : nonnegative_range(array_shape.num_elements().nonnegative_int_from_positive_int())) { if (d <= axis.value) { block_size *= array_shape.at(legion_dim_t{d}); } else { @@ -69,7 +69,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { for (int i = 0; i < attrs.splits.size(); i++) { auto [_, out_block_size] = calc_block_size(output.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); + out_block_sizes[i] = out_block_size.int_from_positive_int(); } float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, @@ -78,8 +78,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { &output_float_ptr, input.get_float_ptr(), out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), + in_block_size.int_from_positive_int(), + num_blocks.int_from_positive_int(), attrs.splits.size()); } @@ -98,7 +98,7 @@ static std::optional for (int i = 0; i < attrs.splits.size(); i++) { coord_t out_num_blocks; auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); + out_block_sizes[i] = out_block_size.int_from_positive_int(); } float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, @@ -107,8 +107,8 @@ static std::optional input_grad.get_float_ptr(), &output_grad_ptr, out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), + in_block_size.int_from_positive_int(), + num_blocks.int_from_positive_int(), attrs.splits.size()); } diff --git a/lib/task-spec/src/task-spec/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc index 11f1fffa41..ea2d855bf6 100644 --- a/lib/task-spec/src/task-spec/ops/topk.cc +++ b/lib/task-spec/src/task-spec/ops/topk.cc @@ -74,8 +74,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - nonnegative_int length = input.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input.shape.get_volume() / length; + positive_int length = input.shape.at(legion_dim_t{0_n}); + positive_int batch_size = positive_int{input.shape.num_elements() / length}; auto indices = acc.get_tensor(INDICES); return profile(forward_kernel, @@ -85,9 +85,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { input.get_float_ptr(), output.get_float_ptr(), indices.get_int32_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative(), + batch_size.int_from_positive_int(), + length.int_from_positive_int(), + attrs.k.int_from_positive_int(), attrs.sorted); } @@ -103,8 +103,8 @@ static std::optional auto indices = acc.get_tensor(INDICES); - nonnegative_int length = input_grad.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input_grad.shape.get_volume() / length; + positive_int length = 
input_grad.shape.at(legion_dim_t{0_n}); + positive_int batch_size = positive_int{input_grad.shape.num_elements() / length}; return profile(backward_kernel, profiling, @@ -113,9 +113,9 @@ static std::optional output_grad.get_float_ptr(), indices.get_int32_ptr(), input_grad.get_float_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative()); + batch_size.int_from_positive_int(), + length.int_from_positive_int(), + attrs.k.int_from_positive_int()); } TaskImplFunction get_topk_init_task_impl() { diff --git a/lib/task-spec/test/src/task-spec/arg_ref.cc b/lib/task-spec/test/src/task-spec/arg_ref.cc index e1c5a9bd8d..dcc2e9e580 100644 --- a/lib/task-spec/test/src/task-spec/arg_ref.cc +++ b/lib/task-spec/test/src/task-spec/arg_ref.cc @@ -10,8 +10,6 @@ enum class ExampleLabelType { TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ArgRefSpec::holds") { - CHECK_MESSAGE(false, "TODO: ArgRefSpec"); - ArgRefSpec arg_ref_spec = ArgRefSpec::create( ArgRef{ExampleLabelType::STRING} ); diff --git a/lib/utils/include/utils/containers/sum.h b/lib/utils/include/utils/containers/sum.h index d6061e396e..a725879f76 100644 --- a/lib/utils/include/utils/containers/sum.h +++ b/lib/utils/include/utils/containers/sum.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_SUM_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_SUM_H +#include + namespace FlexFlow { /** @@ -8,11 +10,20 @@ namespace FlexFlow { **/ template Element sum(Container const &container) { - Element result = Element{0}; + std::optional result; for (Element const &element : container) { - result += element; + if (result.has_value()) { + result.value() += element; + } else { + result = element; + } + } + + if (result.has_value()) { + return result.value(); + } else { + return Element{0}; } - return result; } } // namespace FlexFlow diff --git a/lib/utils/include/utils/nonnegative_int/ceildiv.h b/lib/utils/include/utils/nonnegative_int/ceildiv.h index 939ea3de51..e2ff0bc52a 100644 --- a/lib/utils/include/utils/nonnegative_int/ceildiv.h +++ b/lib/utils/include/utils/nonnegative_int/ceildiv.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H #include "utils/nonnegative_int/nonnegative_int.h" + namespace FlexFlow { nonnegative_int ceildiv(nonnegative_int numerator, nonnegative_int denominator); diff --git a/lib/utils/include/utils/positive_int/ceildiv.h b/lib/utils/include/utils/positive_int/ceildiv.h new file mode 100644 index 0000000000..961e3ca298 --- /dev/null +++ b/lib/utils/include/utils/positive_int/ceildiv.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_CEILDIV_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_CEILDIV_H + +#include "utils/positive_int/positive_int.h" + +namespace FlexFlow { + +positive_int ceildiv(positive_int numerator, positive_int denominator); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/positive_int/positive_int.h b/lib/utils/include/utils/positive_int/positive_int.h new file mode 100644 index 0000000000..9ff0f4da64 --- /dev/null +++ b/lib/utils/include/utils/positive_int/positive_int.h @@ -0,0 +1,114 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_POSITIVE_INT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_POSITIVE_INT_H + +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +struct positive_int { + positive_int() = delete; + explicit positive_int(int value); + explicit positive_int(size_t value); + explicit 
positive_int(nonnegative_int value); + + explicit operator int() const noexcept; + explicit operator nonnegative_int() const noexcept; + + bool operator<(positive_int other) const; + bool operator==(positive_int other) const; + bool operator>(positive_int other) const; + bool operator<=(positive_int other) const; + bool operator!=(positive_int other) const; + bool operator>=(positive_int other) const; + + bool operator<(nonnegative_int other) const; + bool operator==(nonnegative_int other) const; + bool operator>(nonnegative_int other) const; + bool operator<=(nonnegative_int other) const; + bool operator!=(nonnegative_int other) const; + bool operator>=(nonnegative_int other) const; + + friend bool operator<(nonnegative_int lhs, positive_int rhs); + friend bool operator==(nonnegative_int lhs, positive_int rhs); + friend bool operator>(nonnegative_int lhs, positive_int rhs); + friend bool operator<=(nonnegative_int lhs, positive_int rhs); + friend bool operator!=(nonnegative_int lhs, positive_int rhs); + friend bool operator>=(nonnegative_int lhs, positive_int rhs); + + bool operator<(int other) const; + bool operator==(int other) const; + bool operator>(int other) const; + bool operator<=(int other) const; + bool operator!=(int other) const; + bool operator>=(int other) const; + + friend bool operator<(int lhs, positive_int rhs); + friend bool operator==(int lhs, positive_int rhs); + friend bool operator>(int lhs, positive_int rhs); + friend bool operator<=(int lhs, positive_int rhs); + friend bool operator!=(int lhs, positive_int rhs); + friend bool operator>=(int lhs, positive_int rhs); + + positive_int operator+(positive_int other) const; + positive_int operator+(nonnegative_int other) const; + positive_int &operator++(); + positive_int operator++(int); + positive_int &operator+=(positive_int other); + positive_int &operator+=(nonnegative_int other); + + positive_int operator*(positive_int other) const; + positive_int &operator*=(positive_int other); + nonnegative_int operator*(nonnegative_int other) const; + + friend nonnegative_int operator*(nonnegative_int lhs, positive_int rhs); + + nonnegative_int operator/(positive_int other) const; + friend nonnegative_int operator/(nonnegative_int lhs, positive_int rhs); + + friend float operator/(float lhs, positive_int rhs); + friend float &operator/=(float &lhs, positive_int rhs); + + nonnegative_int operator%(positive_int other) const; + nonnegative_int operator%(nonnegative_int other) const; + + int int_from_positive_int() const; + nonnegative_int nonnegative_int_from_positive_int() const; + + friend std::ostream &operator<<(std::ostream &os, positive_int n); + + friend int format_as(positive_int); + +private: + void check_invariant() const; + +private: + int value_; +}; + +positive_int operator""_p(unsigned long long int); + +} // namespace FlexFlow + +namespace nlohmann { +template <> +struct adl_serializer<::FlexFlow::positive_int> { + static ::FlexFlow::positive_int from_json(json const &j); + static void to_json(json &j, ::FlexFlow::positive_int t); +}; +} // namespace nlohmann + +namespace rc { +template <> +struct Arbitrary<::FlexFlow::positive_int> { + static Gen<::FlexFlow::positive_int> arbitrary(); +}; +} // namespace rc + +namespace std { +template <> +struct hash<::FlexFlow::positive_int> { + std::size_t operator()(FlexFlow::positive_int n) const noexcept; +}; +} // namespace std + +#endif diff --git a/lib/utils/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/src/utils/nonnegative_int/ceildiv.cc deleted file mode 100644 index 
f1115b25b5..0000000000 --- a/lib/utils/src/utils/nonnegative_int/ceildiv.cc +++ /dev/null @@ -1,20 +0,0 @@ -#include "utils/nonnegative_int/ceildiv.h" -#include "utils/exception.h" - -namespace FlexFlow { - -nonnegative_int ceildiv(nonnegative_int numerator, - nonnegative_int denominator) { - if (denominator == 0) { - throw mk_runtime_error(fmt::format( - "ceildiv expected denominator != 0, but received {}", denominator)); - } - - int n = numerator.unwrap_nonnegative(); - int d = denominator.unwrap_nonnegative(); - - int result = (n + d - 1) / d; - return nonnegative_int{result}; -} - -} // namespace FlexFlow diff --git a/lib/utils/src/utils/positive_int/ceildiv.cc b/lib/utils/src/utils/positive_int/ceildiv.cc new file mode 100644 index 0000000000..b642db4edd --- /dev/null +++ b/lib/utils/src/utils/positive_int/ceildiv.cc @@ -0,0 +1,14 @@ +#include "utils/positive_int/ceildiv.h" +#include "utils/exception.h" + +namespace FlexFlow { + +positive_int ceildiv(positive_int numerator, positive_int denominator) { + int n = numerator.int_from_positive_int(); + int d = denominator.int_from_positive_int(); + + int result = (n + d - 1) / d; + return positive_int{result}; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/positive_int/positive_int.cc b/lib/utils/src/utils/positive_int/positive_int.cc new file mode 100644 index 0000000000..70233e74d8 --- /dev/null +++ b/lib/utils/src/utils/positive_int/positive_int.cc @@ -0,0 +1,283 @@ +#include "utils/positive_int/positive_int.h" +#include + +namespace FlexFlow { + +positive_int::positive_int(int value) + : value_(value) +{ + this->check_invariant(); +} + +positive_int::positive_int(size_t value) + : value_(value) +{ + this->check_invariant(); +} + +positive_int::positive_int(nonnegative_int value) + : value_(value.unwrap_nonnegative()) +{ + this->check_invariant(); +} + +positive_int::operator int() const noexcept { + return this->value_; +} + +positive_int::operator nonnegative_int() const noexcept { + return nonnegative_int{this->value_}; +} + +bool positive_int::operator<(positive_int other) const { + return this->value_ < other.value_; +} + +bool positive_int::operator==(positive_int other) const { + return this->value_ == other.value_; +} + +bool positive_int::operator>(positive_int other) const { + return this->value_ > other.value_; +} + +bool positive_int::operator<=(positive_int other) const { + return this->value_ <= other.value_; +} + +bool positive_int::operator!=(positive_int other) const { + return this->value_ != other.value_; +} + +bool positive_int::operator>=(positive_int other) const { + return this->value_ >= other.value_; +} + +bool positive_int::operator<(nonnegative_int other) const { + return this->value_ < other; +} + +bool positive_int::operator==(nonnegative_int other) const { + return this->value_ == other; +} + +bool positive_int::operator>(nonnegative_int other) const { + return this->value_ > other; +} + +bool positive_int::operator<=(nonnegative_int other) const { + return this->value_ <= other; +} + +bool positive_int::operator!=(nonnegative_int other) const { + return this->value_ != other; +} + +bool positive_int::operator>=(nonnegative_int other) const { + return this->value_ >= other; +} + +bool operator<(nonnegative_int lhs, positive_int rhs) { + return lhs < rhs.value_; +} + +bool operator==(nonnegative_int lhs, positive_int rhs) { + return lhs == rhs.value_; +} + +bool operator>(nonnegative_int lhs, positive_int rhs) { + return lhs > rhs.value_; +} + +bool operator<=(nonnegative_int lhs, 
positive_int rhs) { + return lhs <= rhs.value_; +} + +bool operator!=(nonnegative_int lhs, positive_int rhs) { + return lhs != rhs.value_; +} + +bool operator>=(nonnegative_int lhs, positive_int rhs) { + return lhs >= rhs.value_; +} + +bool positive_int::operator<(int other) const { + return this->value_ < other; +} + +bool positive_int::operator==(int other) const { + return this->value_ == other; +} + +bool positive_int::operator>(int other) const { + return this->value_ > other; +} + +bool positive_int::operator<=(int other) const { + return this->value_ <= other; +} + +bool positive_int::operator!=(int other) const { + return this->value_ != other; +} + +bool positive_int::operator>=(int other) const { + return this->value_ >= other; +} + +bool operator<(int lhs, positive_int rhs) { + return lhs < rhs.value_; +} + +bool operator==(int lhs, positive_int rhs) { + return lhs == rhs.value_; +} + +bool operator>(int lhs, positive_int rhs) { + return lhs > rhs.value_; +} + +bool operator<=(int lhs, positive_int rhs) { + return lhs <= rhs.value_; +} + +bool operator!=(int lhs, positive_int rhs) { + return lhs != rhs.value_; +} + +bool operator>=(int lhs, positive_int rhs) { + return lhs >= rhs.value_; +} + +positive_int positive_int::operator+(positive_int other) const { + return positive_int{this->value_ + other.value_}; +} + +positive_int positive_int::operator+(nonnegative_int other) const { + return positive_int{this->value_ + other.unwrap_nonnegative()}; +} + +positive_int &positive_int::operator++() { + this->value_++; + this->check_invariant(); + return *this; +} + +positive_int positive_int::operator++(int) { + positive_int result = *this; + this->value_++; + this->check_invariant(); + return result; +} + +positive_int &positive_int::operator+=(positive_int other) { + this->value_ += other.value_; + this->check_invariant(); + return *this; +} + +positive_int &positive_int::operator+=(nonnegative_int other) { + this->value_ += other.unwrap_nonnegative(); + this->check_invariant(); + return *this; +} + +positive_int positive_int::operator*(positive_int other) const { + return positive_int{this->value_ * other.value_}; +} + +positive_int &positive_int::operator*=(positive_int other) { + this->value_ *= other.value_; + this->check_invariant(); + return *this; +} + +nonnegative_int positive_int::operator*(nonnegative_int other) const { + return other * *this; +} + + +nonnegative_int operator*(nonnegative_int lhs, positive_int rhs) { + return lhs * rhs.nonnegative_int_from_positive_int(); +} + +nonnegative_int positive_int::operator/(positive_int other) const { + return nonnegative_int{this->value_ / other.value_}; +} + +nonnegative_int operator/(nonnegative_int lhs, positive_int rhs) { + return nonnegative_int{lhs.unwrap_nonnegative() / rhs.value_}; +} + +float operator/(float lhs, positive_int rhs) { + return lhs / rhs.value_; +} + +float &operator/=(float &lhs, positive_int rhs) { + return (lhs /= rhs.value_); +} + +nonnegative_int positive_int::operator%(positive_int other) const { + return nonnegative_int{this->value_ % other.value_}; +} + +nonnegative_int positive_int::operator%(nonnegative_int other) const { + return nonnegative_int{this->value_ % other.unwrap_nonnegative()}; +} + +int positive_int::int_from_positive_int() const { + return this->value_; +} + +nonnegative_int positive_int::nonnegative_int_from_positive_int() const { + return nonnegative_int{this->value_}; +} + +std::ostream &operator<<(std::ostream &os, positive_int n) { + os << n.value_; + return os; +} + +int 
format_as(positive_int x) { + return x.value_; +} + +void positive_int::check_invariant() const { + ASSERT(this->value_ > 0); +} + +positive_int operator""_p(unsigned long long int x) { + ASSERT(x <= static_cast(std::numeric_limits::max())); + + return positive_int{static_cast(x)}; + +} + +} // namespace FlexFlow + +namespace nlohmann { +::FlexFlow::positive_int + adl_serializer<::FlexFlow::positive_int>::from_json(json const &j) { + return ::FlexFlow::positive_int{j.template get()}; +} + +void adl_serializer<::FlexFlow::positive_int>::to_json( + json &j, ::FlexFlow::positive_int t) { + j = t.int_from_positive_int(); +} +} // namespace nlohmann + +namespace rc { +Gen<::FlexFlow::positive_int> + Arbitrary<::FlexFlow::positive_int>::arbitrary() { + return gen::construct<::FlexFlow::positive_int>(gen::positive()); +} +} // namespace rc + +namespace std { +std::size_t hash<::FlexFlow::positive_int>::operator()( + FlexFlow::positive_int n) const noexcept { + return std::hash{}(n.int_from_positive_int()); +} + +} // namespace std diff --git a/lib/utils/test/src/utils/containers/sum.cc b/lib/utils/test/src/utils/containers/sum.cc index 32d8cd32a3..2e335b1051 100644 --- a/lib/utils/test/src/utils/containers/sum.cc +++ b/lib/utils/test/src/utils/containers/sum.cc @@ -1,6 +1,7 @@ #include "utils/containers/sum.h" #include #include +#include "utils/positive_int/positive_int.h" using namespace ::FlexFlow; @@ -24,4 +25,21 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } } + + TEST_CASE("sum(std::vector)") { + SUBCASE("returns the sum if the input is not empty") { + std::vector input = {3_p, 9_p, 3_p}; + + positive_int result = sum(input); + positive_int correct = 15_p; + + CHECK(result == correct); + } + + SUBCASE("throws an error if the input is empty, as then 0 should be returned") { + std::vector input = {}; + + CHECK_THROWS(sum(input)); + } + } } diff --git a/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc deleted file mode 100644 index 7ac882ff9f..0000000000 --- a/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include "utils/nonnegative_int/ceildiv.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("ceildiv(nonnegative_int, nonnegative_int)") { - SUBCASE("divides evenly") { - nonnegative_int numerator = 12_n; - nonnegative_int denominator = 3_n; - - nonnegative_int result = ceildiv(numerator, denominator); - nonnegative_int correct = 4_n; - - CHECK(result == correct); - } - - SUBCASE("does not divide evenly") { - nonnegative_int numerator = 17_n; - nonnegative_int denominator = 4_n; - - nonnegative_int result = ceildiv(numerator, denominator); - nonnegative_int correct = 5_n; - - CHECK(result == correct); - } - - SUBCASE("denominator is zero") { - nonnegative_int numerator = 15_n; - nonnegative_int denominator = 0_n; - - CHECK_THROWS(ceildiv(numerator, denominator)); - } - - SUBCASE("numerator is zero") { - nonnegative_int numerator = 0_n; - nonnegative_int denominator = 1_n; - - nonnegative_int result = ceildiv(numerator, denominator); - nonnegative_int correct = 0_n; - - CHECK(result == correct); - } - - SUBCASE("denominator and numerator are zero") { - nonnegative_int numerator = 0_n; - nonnegative_int denominator = 0_n; - - CHECK_THROWS(ceildiv(numerator, denominator)); - } - } -} diff --git a/lib/utils/test/src/utils/positive_int/ceildiv.cc b/lib/utils/test/src/utils/positive_int/ceildiv.cc new file mode 100644 index 0000000000..7c37e06d4d --- /dev/null 
+++ b/lib/utils/test/src/utils/positive_int/ceildiv.cc @@ -0,0 +1,28 @@ +#include "utils/positive_int/ceildiv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ceildiv(positive_int, positive_int)") { + SUBCASE("divides evenly") { + positive_int numerator = 12_p; + positive_int denominator = 3_p; + + positive_int result = ceildiv(numerator, denominator); + positive_int correct = 4_p; + + CHECK(result == correct); + } + + SUBCASE("does not divide evenly") { + positive_int numerator = 17_p; + positive_int denominator = 4_p; + + positive_int result = ceildiv(numerator, denominator); + positive_int correct = 5_p; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc new file mode 100644 index 0000000000..25348d34da --- /dev/null +++ b/lib/utils/test/src/utils/positive_int/positive_int.cc @@ -0,0 +1,10 @@ +#include +#include "utils/positive_int/positive_int.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("positive_int") { + CHECK_MESSAGE(false, "TODO: positive_int"); + } +} From a266a79e1092f395c14cbb3225d932389a293621 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 21 May 2025 08:20:50 +0000 Subject: [PATCH 74/91] Format --- .../src/compiler/allowed_machine_views.cc | 19 +- ...ion_graph_series_parallel_decomposition.cc | 11 +- lib/kernels/include/kernels/accessor.h | 1 - lib/kernels/include/kernels/array_coord.h | 5 +- lib/kernels/include/kernels/array_shape.h | 5 +- .../kernels/compare_tensor_accessors.h | 52 +++--- .../kernels/create_accessor_with_contents.h | 36 ++-- .../include/kernels/fill_tensor_accessor.h | 10 +- lib/kernels/include/kernels/legion_dim.h | 4 +- .../kernels/managed_per_device_ff_handle.h | 14 +- .../include/kernels/map_tensor_accessors.h | 60 ++++--- .../include/kernels/reduce_tensor_accessor.h | 66 ++++--- lib/kernels/src/cpu/ops/combine_kernels.cc | 3 +- lib/kernels/src/cpu/ops/replicate_kernels.cc | 4 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 16 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 3 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 13 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 13 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 13 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 13 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 4 +- lib/kernels/src/cuda/optimizer_kernels.cu | 22 +-- lib/kernels/src/kernels/accessor.cc | 13 +- lib/kernels/src/kernels/array_coord.cc | 5 +- lib/kernels/src/kernels/array_shape.cc | 19 +- .../src/kernels/compare_tensor_accessors.cc | 104 ++++++----- .../kernels/create_accessor_with_contents.cc | 49 ++---- .../src/kernels/fill_tensor_accessor.cc | 14 +- .../src/kernels/format_accessor_contents.cc | 46 +++-- .../src/kernels/map_tensor_accessors.cc | 26 +-- .../src/kernels/reduce_tensor_accessor.cc | 15 +- .../src/kernels/tensor_accessor_reductions.cc | 18 +- .../src/managed_per_device_ff_handle.cc | 34 ++-- .../test/src/cpu/ops/replicate_kernels.cc | 5 +- .../test/src/cpu/ops/reverse_kernels.cc | 130 +++++++------- lib/kernels/test/src/internal/test_utils.cc | 9 +- lib/kernels/test/src/kernels/accessor.cc | 28 +-- lib/kernels/test/src/kernels/array_coord.cc | 27 +-- lib/kernels/test/src/kernels/array_shape.cc | 25 ++- .../src/kernels/compare_tensor_accessors.cc | 166 +++++++++--------- .../kernels/create_accessor_with_contents.cc | 89 +++++----- .../src/kernels/format_accessor_contents.cc | 55 +++--- 
.../test/src/kernels/map_tensor_accessors.cc | 49 ++++-- .../src/kernels/reduce_tensor_accessor.cc | 78 ++++---- .../src/kernels/tensor_accessor_reductions.cc | 106 +++++------ lib/kernels/test/src/test_attention_kernel.cc | 5 +- .../test/src/test_batch_matmul_kernel.cc | 5 +- .../test/src/test_batch_norm_kernel.cc | 5 +- lib/kernels/test/src/test_combine_kernel.cc | 5 +- lib/kernels/test/src/test_concat_kernel.cc | 5 +- lib/kernels/test/src/test_dropout.cc | 5 +- lib/kernels/test/src/test_flat_kernel.cc | 5 +- lib/kernels/test/src/test_gather_kernels.cc | 5 +- .../test/src/test_layer_norm_kernels.cc | 5 +- .../test/src/test_managed_ff_stream.cc | 5 +- .../src/test_managed_per_device_ff_handle.cc | 16 +- lib/kernels/test/src/test_partition_kernel.cc | 5 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 41 +++-- lib/kernels/test/src/test_reduction_kernel.cc | 5 +- lib/kernels/test/src/test_replicate_kernel.cc | 32 ++-- lib/kernels/test/src/test_reshape_kernel.cc | 5 +- lib/kernels/test/src/test_reverse_kernels.cc | 10 +- lib/kernels/test/src/test_softmax_kernel.cc | 5 +- lib/kernels/test/src/test_split_kernel.cc | 5 +- lib/kernels/test/src/test_transpose_kernel.cc | 5 +- .../local_task_argument_accessor.h | 2 +- .../include/local-execution/loss_functions.h | 2 +- .../include/local-execution/optimizer.h | 2 +- lib/local-execution/src/allocated_tensors.cc | 11 +- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/loss_functions.cc | 24 ++- lib/local-execution/src/task_registry.cc | 2 +- .../test/src/test_allocated_tensors.cc | 11 +- lib/local-execution/test/src/test_e2e.cc | 42 ++--- .../test/src/test_local_cost_estimator.cc | 5 +- .../test/src/test_local_task_arg_accessor.cc | 3 +- .../test/src/test_local_tensor_backing.cc | 6 +- .../test/src/test_loss_functions.cc | 17 +- .../test/src/test_task_registry.cc | 2 +- .../test/src/test_unallocated_tensors.cc | 11 +- lib/local-execution/test/src/test_update.cc | 11 +- lib/models/src/models/dlrm/dlrm.cc | 22 +-- lib/op-attrs/include/op-attrs/datatype.h | 33 ++-- .../initializers/kaiming_initializer_mode.h | 2 +- .../include/op-attrs/parallel_tensor_dims.h | 10 +- .../include/op-attrs/parallel_tensor_shape.h | 13 +- .../op-attrs/replica_parallel_dim_set.h | 2 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 12 +- .../initializers/kaiming_initializer_mode.cc | 2 +- lib/op-attrs/src/op-attrs/ops/attention.cc | 9 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 7 +- lib/op-attrs/src/op-attrs/ops/combine.cc | 6 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 3 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 4 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 6 +- lib/op-attrs/src/op-attrs/ops/reduction.cc | 5 +- .../src/op-attrs/parallel_tensor_dims.cc | 3 +- .../src/op-attrs/parallel_tensor_shape.cc | 7 +- .../src/op-attrs/replica_parallel_dim_set.cc | 2 +- .../test/src/op-attrs/ops/attention.cc | 30 +--- lib/op-attrs/test/src/op-attrs/ops/cast.cc | 10 +- lib/op-attrs/test/src/op-attrs/ops/combine.cc | 3 +- lib/op-attrs/test/src/op-attrs/ops/concat.cc | 3 +- lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc | 3 +- .../test/src/op-attrs/ops/embedding.cc | 10 +- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 15 +- lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc | 4 +- .../test/src/op-attrs/ops/reduction.cc | 3 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 21 +-- lib/pcg/include/pcg/machine_specification.h | 4 +- lib/pcg/src/pcg/machine_specification.cc | 4 +- lib/pcg/src/pcg/machine_view.cc | 4 +- 
lib/pcg/src/pcg/operator_task_space.cc | 3 +- .../materialize_operator_from_attrs_map.cc | 3 +- .../task-spec/generic_task_impl_function.h | 2 +- .../task-spec/init_op_task_impl_function.h | 2 +- .../task-spec/itask_argument_accessor.h | 2 +- .../include/task-spec/ops/attention.h | 2 +- .../include/task-spec/ops/batch_matmul.h | 2 +- .../include/task-spec/ops/batch_norm.h | 2 +- lib/task-spec/include/task-spec/ops/cast.h | 2 +- lib/task-spec/include/task-spec/ops/combine.h | 2 +- lib/task-spec/include/task-spec/ops/concat.h | 2 +- lib/task-spec/include/task-spec/ops/conv_2d.h | 2 +- lib/task-spec/include/task-spec/ops/dropout.h | 2 +- .../include/task-spec/ops/element_binary.h | 2 +- .../include/task-spec/ops/element_unary.h | 2 +- .../include/task-spec/ops/embedding.h | 2 +- lib/task-spec/include/task-spec/ops/flat.h | 2 +- lib/task-spec/include/task-spec/ops/gather.h | 2 +- .../include/task-spec/ops/layer_norm.h | 2 +- lib/task-spec/include/task-spec/ops/linear.h | 2 +- lib/task-spec/include/task-spec/ops/pool_2d.h | 2 +- lib/task-spec/include/task-spec/ops/reduce.h | 2 +- .../include/task-spec/ops/reduction.h | 2 +- .../include/task-spec/ops/repartition.h | 2 +- .../include/task-spec/ops/replicate.h | 2 +- lib/task-spec/include/task-spec/ops/reshape.h | 2 +- lib/task-spec/include/task-spec/ops/reverse.h | 2 +- lib/task-spec/include/task-spec/ops/softmax.h | 2 +- lib/task-spec/include/task-spec/ops/split.h | 2 +- lib/task-spec/include/task-spec/ops/topk.h | 2 +- .../include/task-spec/ops/transpose.h | 2 +- .../task-spec/task_argument_accessor.h | 2 +- .../include/task-spec/task_signature_impl.h | 2 +- lib/task-spec/src/task-spec/ops/layer_norm.cc | 3 +- lib/task-spec/src/task-spec/ops/linear.cc | 2 +- lib/task-spec/src/task-spec/ops/split.cc | 3 +- lib/task-spec/src/task-spec/ops/topk.cc | 3 +- lib/task-spec/test/src/task-spec/arg_ref.cc | 12 +- .../src/utils/positive_int/positive_int.cc | 19 +- lib/utils/test/src/utils/containers/sum.cc | 11 +- .../src/utils/positive_int/positive_int.cc | 2 +- 154 files changed, 1106 insertions(+), 1114 deletions(-) diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index fa543e78b5..370cb5a4ec 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -17,10 +17,10 @@ #include "utils/containers/unordered_multiset_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/zip.h" -#include "utils/positive_int/ceildiv.h" #include "utils/nonnegative_int/nonnegative_range.h" #include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" +#include "utils/positive_int/ceildiv.h" namespace FlexFlow { @@ -57,7 +57,8 @@ static std::unordered_set product(transform(tensor_dims, [](positive_int num_devices) { return nonnegative_int{num_devices.int_from_positive_int() - 1}; })); - return ceildiv(total_devices, positive_int{min_num_devices_with_full_stride_volume}); + return ceildiv(total_devices, + positive_int{min_num_devices_with_full_stride_volume}); }; auto candidate_strides = [&](std::vector const &tensor_dims, @@ -66,9 +67,11 @@ static std::unordered_set positive_int max_stride_upper_bound = get_max_stride_upper_bound(tensor_dims, total_devices); - std::vector single_stride_range = - transform(nonnegative_range(1_n, max_stride_upper_bound.nonnegative_int_from_positive_int() + 1_n), - [](nonnegative_int stride) { return stride_t{positive_int{stride}}; }); + std::vector single_stride_range = 
transform( + nonnegative_range( + 1_n, + max_stride_upper_bound.nonnegative_int_from_positive_int() + 1_n), + [](nonnegative_int stride) { return stride_t{positive_int{stride}}; }); std::unordered_multiset> raw_stride_vectors = cartesian_product( repeat_element(/*num_times=*/num_elements(tensor_dims), @@ -83,9 +86,11 @@ static std::unordered_set auto candidate_starts = [](MachineSpecification const &ms, DeviceType const &device_type) { std::unordered_set result; - for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes.nonnegative_int_from_positive_int())) { + for (nonnegative_int node_idx : + nonnegative_range(ms.num_nodes.nonnegative_int_from_positive_int())) { for (nonnegative_int device_idx : - nonnegative_range(get_num_devices_per_node(ms, device_type).nonnegative_int_from_positive_int())) { + nonnegative_range(get_num_devices_per_node(ms, device_type) + .nonnegative_int_from_positive_int())) { result.insert( MachineSpaceCoordinate{node_idx, device_idx, device_type}); } diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index 1c801161ca..1625d79f80 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -339,12 +339,11 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = - TensorShape{TensorDims{FFOrdered{ - 10_p, - 12_p, - }}, - DataType::FLOAT}; + TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ + 10_p, + 12_p, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); b.dense(input, /*outDim=*/14_p); diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index c24695298b..eb2a431bd1 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -226,7 +226,6 @@ bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, GenericTensorAccessorW const &acc2); - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype); diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h index 84e68fa053..730bb49e81 100644 --- a/lib/kernels/include/kernels/array_coord.h +++ b/lib/kernels/include/kernels/array_coord.h @@ -5,8 +5,9 @@ namespace FlexFlow { -ArrayCoord array_coord_drop_dims(ArrayCoord const &coord, - std::function const &should_drop_dim); +ArrayCoord + array_coord_drop_dims(ArrayCoord const &coord, + std::function const &should_drop_dim); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 355b6e5bca..2b1397dc0e 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -61,8 +61,9 @@ TensorShape get_tensor_shape(ArrayShape const &, DataType); std::unordered_set get_ff_dim_t_set(ArrayShape const &); std::unordered_set get_array_coord_set(ArrayShape const &); -ArrayShape array_shape_drop_dims(ArrayShape const &shape, - std::function const &should_drop_dim); +ArrayShape + 
array_shape_drop_dims(ArrayShape const &shape, + std::function const &should_drop_dim); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/compare_tensor_accessors.h b/lib/kernels/include/kernels/compare_tensor_accessors.h index ee438505fb..c16ae0857c 100644 --- a/lib/kernels/include/kernels/compare_tensor_accessors.h +++ b/lib/kernels/include/kernels/compare_tensor_accessors.h @@ -6,29 +6,35 @@ namespace FlexFlow { -GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); +GenericTensorAccessorW + compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h index 966a7a30ad..9691b0c90a 100644 --- a/lib/kernels/include/kernels/create_accessor_with_contents.h +++ b/lib/kernels/include/kernels/create_accessor_with_contents.h @@ -23,7 +23,8 @@ GenericTensorAccessorW Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { + for (nonnegative_int col_idx : + nonnegative_range(ncols.nonnegative_int_from_positive_int())) { cpu_accessor.at>(FFOrdered{col_idx}) = contents.at(col_idx.unwrap_nonnegative()); } @@ -53,11 +54,13 @@ GenericTensorAccessorW create_2d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int row_idx : nonnegative_range(nrows.nonnegative_int_from_positive_int())) { - for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { - cpu_accessor.at>(FFOrdered{row_idx, col_idx}) = - contents.at(row_idx.unwrap_nonnegative()) - 
.at(col_idx.unwrap_nonnegative()); + for (nonnegative_int row_idx : + nonnegative_range(nrows.nonnegative_int_from_positive_int())) { + for (nonnegative_int col_idx : + nonnegative_range(ncols.nonnegative_int_from_positive_int())) { + cpu_accessor.at>(FFOrdered{ + row_idx, col_idx}) = contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); } } @@ -95,9 +98,12 @@ GenericTensorAccessorW create_3d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim0_idx : + nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : + nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : + nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { cpu_accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) @@ -151,10 +157,14 @@ GenericTensorAccessorW create_4d_accessor_w_with_contents( GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim3_idx : nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim0_idx : + nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : + nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : + nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim3_idx : + nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) diff --git a/lib/kernels/include/kernels/fill_tensor_accessor.h b/lib/kernels/include/kernels/fill_tensor_accessor.h index 8db63f5a2d..b10345933f 100644 --- a/lib/kernels/include/kernels/fill_tensor_accessor.h +++ b/lib/kernels/include/kernels/fill_tensor_accessor.h @@ -9,13 +9,11 @@ namespace FlexFlow { void fill_tensor_accessor(GenericTensorAccessorW &, DataTypeValue val); -GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator); +GenericTensorAccessorW create_accessor_w_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator); -GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator); +GenericTensorAccessorR create_accessor_r_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 63c6ddb3c6..796423102b 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -7,9 
+7,9 @@ #include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/set_of.h" #include "utils/containers/transform.h" -#include "utils/positive_int/positive_int.h" -#include "utils/nonnegative_int/num_elements.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" +#include "utils/positive_int/positive_int.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index d409ec19ad..287369a202 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -33,12 +33,14 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle *handle; }; -ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, - bool allowTensorOpMathConversion); -ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, - int my_rank, - size_t workSpaceSize, - bool allowTensorOpMathConversion); +ManagedPerDeviceFFHandle + initialize_single_gpu_handle(size_t workSpaceSize, + bool allowTensorOpMathConversion); +ManagedPerDeviceFFHandle + initialize_multi_gpu_handle(int num_ranks, + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/map_tensor_accessors.h b/lib/kernels/include/kernels/map_tensor_accessors.h index eed17cbb61..2933a611cf 100644 --- a/lib/kernels/include/kernels/map_tensor_accessors.h +++ b/lib/kernels/include/kernels/map_tensor_accessors.h @@ -3,11 +3,11 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" -#include "kernels/local_cpu_allocator.h" #include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" -#include "utils/containers/require_same.h" +#include "kernels/local_cpu_allocator.h" #include "utils/containers/require_all_same1.h" +#include "utils/containers/require_same.h" #include namespace FlexFlow { @@ -15,23 +15,21 @@ namespace FlexFlow { template struct CPUMapTensorAccessorInPlace { template - void operator()(GenericTensorAccessorW &accessor, - F &&f) { + void operator()(GenericTensorAccessorW &accessor, F &&f) { ASSERT(accessor.device_type == DeviceType::CPU); for (ArrayCoord const &coord : get_array_coord_set(accessor.shape)) { - accessor.at
<DT>(coord.ff_ordered) - = f(accessor.at
<DT>(coord.ff_ordered)); + accessor.at
<DT>(coord.ff_ordered) = f(accessor.at
<DT>(coord.ff_ordered)); } } }; template <typename F> -void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, - F &&f) { +void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, F &&f) { ASSERT(accessor.device_type == DeviceType::CPU); - DataTypeDispatch1<CPUMapTensorAccessorInPlace>{}(accessor.data_type, accessor, f); + DataTypeDispatch1<CPUMapTensorAccessorInPlace>{}( + accessor.data_type, accessor, f); } template <DataType DT> @@ -47,9 +45,8 @@ struct CPUMapTensorAccessor { for (ArrayCoord const &coord : get_array_coord_set(shape)) { output.at< - type_to_data_type_enum_v<std::invoke_result_t<F, real_type_t<DT>>> - >(coord.ff_ordered) - = f(input.at
<DT>(coord.ff_ordered)); + type_to_data_type_enum_v<std::invoke_result_t<F, real_type_t<DT>>>>( + coord.ff_ordered) = f(input.at<DT>
(coord.ff_ordered)); } } }; @@ -59,30 +56,32 @@ GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, F &&f, Allocator &output_allocator) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(input.shape, type_to_data_type_enum_v)); + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( + get_tensor_shape(input.shape, type_to_data_type_enum_v)); - DataTypeDispatch1{}(input.data_type, input_cpu, output_cpu, f); + DataTypeDispatch1{}( + input.data_type, input_cpu, output_cpu, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } template struct CPUMapTensorAccessors2 { - template < - typename F, - typename Out = std::invoke_result_t, real_type_t> - > + template , real_type_t>> void operator()(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, GenericTensorAccessorW &output, F &&f) { ArrayShape shape = throw_if_unexpected(require_all_same1(std::vector{ - lhs.shape, - rhs.shape, - output.shape, + lhs.shape, + rhs.shape, + output.shape, })); ASSERT(lhs.device_type == DeviceType::CPU); @@ -90,8 +89,8 @@ struct CPUMapTensorAccessors2 { ASSERT(output.device_type == DeviceType::CPU); for (ArrayCoord const &coord : get_array_coord_set(shape)) { - output.at>(coord.ff_ordered) - = f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); + output.at>(coord.ff_ordered) = + f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); } } }; @@ -105,16 +104,19 @@ GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, ArrayShape shape = require_same(lhs.shape, rhs.shape); Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR lhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); - GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); + GenericTensorAccessorR lhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); + GenericTensorAccessorR rhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); - DataTypeDispatch2{}(lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); + DataTypeDispatch2{}( + lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/reduce_tensor_accessor.h b/lib/kernels/include/kernels/reduce_tensor_accessor.h index 4be375299f..d803c7ef9b 100644 --- a/lib/kernels/include/kernels/reduce_tensor_accessor.h +++ b/lib/kernels/include/kernels/reduce_tensor_accessor.h @@ -4,15 +4,15 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" #include "kernels/array_coord.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" #include "utils/containers/contains.h" -#include "utils/containers/sorted.h" -#include "utils/containers/group_by.h" -#include "utils/containers/transform.h" #include "utils/containers/foldl1.h" #include 
"utils/containers/foldr1.h" -#include "kernels/local_cpu_allocator.h" -#include "kernels/copy_tensor_accessor.h" -#include "kernels/datatype_dispatch.h" +#include "utils/containers/group_by.h" +#include "utils/containers/sorted.h" +#include "utils/containers/transform.h" namespace FlexFlow { @@ -32,18 +32,24 @@ struct CPUReduceTensorAccessorInDims { return contains(dims_to_reduce, dim); }; - std::unordered_map> output_coord_from_input_coord - = group_by(get_array_coord_set(input.shape), - [&](ArrayCoord const &input_coord) { return array_coord_drop_dims(input_coord, should_drop_dim); }); + std::unordered_map> + output_coord_from_input_coord = group_by( + get_array_coord_set(input.shape), + [&](ArrayCoord const &input_coord) { + return array_coord_drop_dims(input_coord, should_drop_dim); + }); - for (auto const &[output_coord, input_coords] : output_coord_from_input_coord) { - std::vector input_values = transform(sorted(input_coords), - [&](ArrayCoord const &input_coord) -> T { - return input.at
(input_coord.ff_ordered); - }); + for (auto const &[output_coord, input_coords] : + output_coord_from_input_coord) { + std::vector input_values = transform( + sorted(input_coords), [&](ArrayCoord const &input_coord) -> T { + return input.at
(input_coord.ff_ordered); + }); T result = foldl1(input_values, f); - ASSERT(result == foldr1(input_values, [&](T const &accum, T const &elem) { return f(elem, accum); })); + ASSERT(result == foldr1(input_values, [&](T const &accum, T const &elem) { + return f(elem, accum); + })); output.at
<DT>(output_coord.ff_ordered) = result; } @@ -51,34 +57,40 @@ }; template <typename F> -GenericTensorAccessorW reduce_tensor_accessor_in_dims( - GenericTensorAccessorR const &input, - std::unordered_set<ff_dim_t> const &dims, - Allocator &output_allocator, - F &&f) { +GenericTensorAccessorW + reduce_tensor_accessor_in_dims(GenericTensorAccessorR const &input, + std::unordered_set<ff_dim_t> const &dims, + Allocator &output_allocator, + F &&f) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); auto should_drop_dim = [&](ff_dim_t dim) -> bool { return contains(dims, dim); }; - ArrayShape reduced_shape = array_shape_drop_dims(input.shape, should_drop_dim); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(reduced_shape, input.data_type)); + ArrayShape reduced_shape = + array_shape_drop_dims(input.shape, should_drop_dim); + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( + get_tensor_shape(reduced_shape, input.data_type)); - DataTypeDispatch1<CPUReduceTensorAccessorInDims>{}(input_cpu.data_type, input_cpu, output_cpu, dims, f); + DataTypeDispatch1<CPUReduceTensorAccessorInDims>{}( + input_cpu.data_type, input_cpu, output_cpu, dims, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } template <DataType DT, typename F> -real_type_t
<DT> reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &input, - F &&f) { +real_type_t<DT>
+ reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &input, + F &&f) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); std::unordered_set<ff_dim_t> input_dims = get_ff_dim_t_set(input.shape); - GenericTensorAccessorW reduced = reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); + GenericTensorAccessorW reduced = + reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); return accessor_get_only_value
<DT>(reduced); } diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc index 557f523f17..c0c856ae5b 100644 --- a/lib/kernels/src/cpu/ops/combine_kernels.cc +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -18,7 +18,8 @@ template <DataType DT> struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); + size_t num_elements = + output_grad.shape.num_elements().int_from_positive_int(); for (int i = 0; i < num_elements; ++i) { input_grad.get
<DT>()[i] += output_grad.get
<DT>()[i]; } diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc index d97a274d80..bc9c4eab0d 100644 --- a/lib/kernels/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -23,8 +23,8 @@ struct CPUBackwardKernel { nonnegative_int num_replicas) { using T = real_type_t
<DT>; - for (nonnegative_int i : - nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { + for (nonnegative_int i : + nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { T cur_sum = 0; for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { cur_sum += output.at
<DT>(LegionOrdered{replica_idx, i}); } diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 4920696756..f091a69b71 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -27,12 +27,13 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync(output.get
<DT>(), - input.get
<DT>(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(output.get
<DT>(), + input.get
<DT>(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; @@ -41,7 +42,8 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); + size_t num_elements = + output_grad.shape.num_elements().int_from_positive_int(); add_kernel<real_type_t<DT>> <<<GET_BLOCKS(num_elements), CUDA_NUM_THREADS, 0, stream>>>( input_grad.get
<DT>(), output_grad.get<DT>
(), num_elements); diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index bee8f68eef..e251a57f8a 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -132,7 +132,8 @@ void forward_kernel(ffStream_t stream, stride = 1; } - coord_t output_dim_size = output.shape.at(m.legion_dim).int_from_positive_int(); + coord_t output_dim_size = + output.shape.at(m.legion_dim).int_from_positive_int(); coord_t input_dim_size = input.shape.at(m.legion_dim).int_from_positive_int(); assert(index.data_type == DataType::INT32 || diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index e4a83a12c8..94690a74fb 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -27,12 +27,13 @@ struct ForwardKernel { RepartitionPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync(output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(output.get(), + input.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index ac3b7c9b08..93400d333f 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -57,12 +57,13 @@ struct BackwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - checkCUDA(cudaMemcpyAsync(input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(input.get(), + output.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 23e65cc1f3..9f532c96b1 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -38,12 +38,13 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), - (void *)input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync((void *)output.get(), + (void *)input.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 06aa8d74b2..3f0d6bb15a 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -31,12 +31,13 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync(output.get(), - input.get(), - 
input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(output.get(), + input.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 13162a9888..4e3c69eedf 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -76,8 +76,8 @@ void forward_kernel(cudaStream_t stream, info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = - input.shape.at(legion_dim_t{nonnegative_int{i}}).int_from_positive_int(); + int in_dim_size = input.shape.at(legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index e1ab7eb92c..2fce3c5db9 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -168,17 +168,17 @@ __host__ void adam_ps_update_task_gpu(ffStream_t stream, #ifdef FF_USE_NCCL __host__ void adam_nccl_update_task_gpu(ffStream_t stream, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &handle, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr) { // Step 1: Use NCCL to sync gradients checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index 46137c3c9c..5a1881eb66 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -266,14 +266,13 @@ std::vector return get(a); } - GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { return GenericTensorAccessorR{ - writable.data_type, - writable.shape, - writable.ptr, - writable.device_type, + writable.data_type, + writable.shape, + writable.ptr, + writable.device_type, }; } @@ -311,7 +310,7 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -template - int32_t accessor_get_only_value(GenericTensorAccessorR const &); +template int32_t + accessor_get_only_value(GenericTensorAccessorR const &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/array_coord.cc b/lib/kernels/src/kernels/array_coord.cc index 60bb19351c..0927cb9951 100644 --- a/lib/kernels/src/kernels/array_coord.cc +++ b/lib/kernels/src/kernels/array_coord.cc @@ -5,8 +5,9 @@ namespace FlexFlow { -ArrayCoord array_coord_drop_dims(ArrayCoord const &coord, - std::function const &should_drop_dim) { +ArrayCoord array_coord_drop_dims( + ArrayCoord const &coord, + std::function const &should_drop_dim) { std::vector result; for (ff_dim_t idx : get_idxs(coord.ff_ordered)) { if (!should_drop_dim(idx)) { diff --git a/lib/kernels/src/kernels/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc index 18b8861164..a1fb9bf09b 100644 --- a/lib/kernels/src/kernels/array_shape.cc +++ 
b/lib/kernels/src/kernels/array_shape.cc @@ -1,6 +1,7 @@ #include "kernels/array_shape.h" #include "kernels/legion_ordered/slice.h" #include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/ff_ordered/slice.h" #include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" @@ -10,9 +11,8 @@ #include "utils/containers/vector_of.h" #include "utils/hash/tuple.h" #include "utils/hash/vector.h" -#include "utils/nonnegative_int/num_elements.h" -#include "op-attrs/ff_ordered/get_idxs.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -109,11 +109,11 @@ std::unordered_set get_ff_dim_t_set(ArrayShape const &shape) { } std::unordered_set get_array_coord_set(ArrayShape const &shape) { - std::vector> per_dim_ranges = - transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), - [](positive_int dim_size) -> std::vector { - return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); - }); + std::vector> per_dim_ranges = transform( + vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](positive_int dim_size) -> std::vector { + return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); + }); std::unordered_set> raw_points = unordered_set_of(cartesian_product(per_dim_ranges)); @@ -124,8 +124,9 @@ std::unordered_set get_array_coord_set(ArrayShape const &shape) { }); } -ArrayShape array_shape_drop_dims(ArrayShape const &shape, - std::function const &should_drop_dim) { +ArrayShape array_shape_drop_dims( + ArrayShape const &shape, + std::function const &should_drop_dim) { std::vector result; for (ff_dim_t idx : get_idxs(ff_ordered_from_legion_ordered(shape.dims))) { if (!should_drop_dim(idx)) { diff --git a/lib/kernels/src/kernels/compare_tensor_accessors.cc b/lib/kernels/src/kernels/compare_tensor_accessors.cc index b1f5fd39b7..9fa9865c16 100644 --- a/lib/kernels/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/src/kernels/compare_tensor_accessors.cc @@ -3,60 +3,76 @@ namespace FlexFlow { -GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l < r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l < r; }, + output_allocator); } -GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l <= r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l <= r; }, + output_allocator); } - -GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l > r; 
}, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l > r; }, + output_allocator); } -GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l >= r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l >= r; }, + output_allocator); } -GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l == r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l == r; }, + output_allocator); } - -GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l != r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l != r; }, + output_allocator); } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/create_accessor_with_contents.cc b/lib/kernels/src/kernels/create_accessor_with_contents.cc index f8b85baa4a..32b61926bd 100644 --- a/lib/kernels/src/kernels/create_accessor_with_contents.cc +++ b/lib/kernels/src/kernels/create_accessor_with_contents.cc @@ -2,43 +2,32 @@ namespace FlexFlow { -template - GenericTensorAccessorW - create_1d_accessor_w_with_contents(std::vector const &, +template GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &, Allocator &); + +template GenericTensorAccessorW + create_2d_accessor_w_with_contents(std::vector> const &, Allocator &); -template - GenericTensorAccessorW create_2d_accessor_w_with_contents( - std::vector> const &, Allocator &); +template GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &, Allocator &); -template - GenericTensorAccessorW create_3d_accessor_w_with_contents( - std::vector>> const &, - Allocator &); +template GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &, + Allocator &); -template - GenericTensorAccessorW create_4d_accessor_w_with_contents( - std::vector>>> const &, - Allocator &); +template GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &, Allocator &); -template - GenericTensorAccessorR - create_1d_accessor_r_with_contents(std::vector const &, +template GenericTensorAccessorR + 
create_2d_accessor_r_with_contents(std::vector> const &, Allocator &); -template - GenericTensorAccessorR create_2d_accessor_r_with_contents( - std::vector> const &, Allocator &); - -template - GenericTensorAccessorR create_3d_accessor_r_with_contents( - std::vector>> const &, - Allocator &); - -template - GenericTensorAccessorR create_4d_accessor_r_with_contents( - std::vector>>> const &, - Allocator &); +template GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &, Allocator &); +template GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &, + Allocator &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/fill_tensor_accessor.cc b/lib/kernels/src/kernels/fill_tensor_accessor.cc index f173bd0860..bee8d12556 100644 --- a/lib/kernels/src/kernels/fill_tensor_accessor.cc +++ b/lib/kernels/src/kernels/fill_tensor_accessor.cc @@ -6,21 +6,17 @@ namespace FlexFlow { void fill_tensor_accessor(GenericTensorAccessorW &accessor, DataTypeValue val) { ASSERT(accessor.device_type == DeviceType::CPU); ASSERT(accessor.data_type == get_data_type_of_data_type_value(val)); - } -GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator) { +GenericTensorAccessorW create_accessor_w_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator) { NOT_IMPLEMENTED(); } -GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator) { +GenericTensorAccessorR create_accessor_r_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator) { return read_only_accessor_from_write_accessor( - create_accessor_w_filled_with(shape, val, allocator)); + create_accessor_w_filled_with(shape, val, allocator)); } - } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc index 3d24483967..ed54b21cfd 100644 --- a/lib/kernels/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -3,8 +3,8 @@ #include "kernels/datatype_dispatch.h" #include "kernels/local_cpu_allocator.h" #include "utils/indent.h" -#include #include "utils/nonnegative_int/nonnegative_range.h" +#include namespace FlexFlow { @@ -19,12 +19,12 @@ struct Print1DCPUAccessorR { positive_int ncols = accessor.shape.at(ff_dim_t{0_n}); stream << "[" - << join_strings(nonnegative_range(ncols.nonnegative_int_from_positive_int()), - " ", - [&](nonnegative_int col_idx) -> std::string { - return fmt::to_string( - accessor.at
<DT>(FFOrdered{col_idx})); - }) + << join_strings( + nonnegative_range(ncols.nonnegative_int_from_positive_int()), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string(accessor.at<DT>
(FFOrdered{col_idx})); + }) << "]"; } }; @@ -51,7 +51,8 @@ struct Print2DCPUAccessorR { auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), + join_strings(nonnegative_range( + dim1_size.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int dim1_idx) -> std::string { return fmt::to_string( @@ -61,8 +62,11 @@ struct Print2DCPUAccessorR { }; stream << "[\n" - << indent( - join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_1d)) + << indent(join_strings( + nonnegative_range( + dim0_size.nonnegative_int_from_positive_int()), + "\n", + render_1d)) << "\n]"; } }; @@ -92,7 +96,8 @@ struct Print3DCPUAccessorR { auto render_1d = [&](nonnegative_int dim0_idx, nonnegative_int dim1_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim2_size.nonnegative_int_from_positive_int()), + join_strings(nonnegative_range( + dim2_size.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int dim2_idx) -> std::string { return fmt::to_string(accessor.at
( @@ -103,17 +108,22 @@ struct Print3DCPUAccessorR { auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { return "[\n" + - indent(join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), - "\n", - [&](nonnegative_int dim1_idx) -> std::string { - return render_1d(dim0_idx, dim1_idx); - })) + + indent(join_strings( + nonnegative_range( + dim1_size.nonnegative_int_from_positive_int()), + "\n", + [&](nonnegative_int dim1_idx) -> std::string { + return render_1d(dim0_idx, dim1_idx); + })) + "\n]"; }; stream << "[\n" - << indent( - join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_2d)) + << indent(join_strings( + nonnegative_range( + dim0_size.nonnegative_int_from_positive_int()), + "\n", + render_2d)) << "\n]"; } }; diff --git a/lib/kernels/src/kernels/map_tensor_accessors.cc b/lib/kernels/src/kernels/map_tensor_accessors.cc index c59d2207d0..77200fcefb 100644 --- a/lib/kernels/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/src/kernels/map_tensor_accessors.cc @@ -4,24 +4,26 @@ namespace FlexFlow { struct F1 { template - float operator()(T const &t) const { NOT_IMPLEMENTED(); } + float operator()(T const &t) const { + NOT_IMPLEMENTED(); + } }; -template -GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &, - F1 &&, - Allocator &); +template GenericTensorAccessorW + map_tensor_accessor(GenericTensorAccessorR const &, F1 &&, Allocator &); struct F2 { template - float operator()(T1 const &lhs, T2 const &rhs) const { NOT_IMPLEMENTED(); } + float operator()(T1 const &lhs, T2 const &rhs) const { + NOT_IMPLEMENTED(); + } }; -template - GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &, - GenericTensorAccessorR const &, - DataType, - F2 &&, - Allocator &); +template GenericTensorAccessorW + map_tensor_accessors2(GenericTensorAccessorR const &, + GenericTensorAccessorR const &, + DataType, + F2 &&, + Allocator &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/src/kernels/reduce_tensor_accessor.cc index b9c4cee085..b51306d0e8 100644 --- a/lib/kernels/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/src/kernels/reduce_tensor_accessor.cc @@ -4,14 +4,13 @@ namespace FlexFlow { using F = std::function; -template - GenericTensorAccessorW reduce_tensor_accessor_in_dims( - GenericTensorAccessorR const &, - std::unordered_set const &, - Allocator &, - F &&); +template GenericTensorAccessorW + reduce_tensor_accessor_in_dims(GenericTensorAccessorR const &, + std::unordered_set const &, + Allocator &, + F &&); -template - int32_t reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &, F &&); +template int32_t reduce_tensor_accessor_in_all_dims( + GenericTensorAccessorR const &, F &&); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/src/kernels/tensor_accessor_reductions.cc index baeb9fadc1..b11791d32c 100644 --- a/lib/kernels/src/kernels/tensor_accessor_reductions.cc +++ b/lib/kernels/src/kernels/tensor_accessor_reductions.cc @@ -8,20 +8,22 @@ bool tensor_accessor_all(GenericTensorAccessorR const &t) { ASSERT(t.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( - t, overload { - [](bool lhs, bool rhs) -> bool { return lhs && rhs; }, - [](auto lhs, auto rhs) -> bool { PANIC(); }, - }); + t, + overload{ + [](bool lhs, bool rhs) -> bool { return lhs && rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); } bool 
tensor_accessor_any(GenericTensorAccessorR const &t) { ASSERT(t.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( - t, overload { - [](bool lhs, bool rhs) -> bool { return lhs || rhs; }, - [](auto lhs, auto rhs) -> bool { PANIC(); }, - }); + t, + overload{ + [](bool lhs, bool rhs) -> bool { return lhs || rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index 7c619bb557..305a6c935c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -5,7 +5,10 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( - int num_ranks, int my_rank, size_t workSpaceSize, bool allowTensorOpMathConversion) { + int num_ranks, + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { this->handle = new PerDeviceFFHandle{}; this->handle->workSpaceSize = workSpaceSize; this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; @@ -48,24 +51,27 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } -ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, bool allowTensorOpMathConversion) { +ManagedPerDeviceFFHandle + initialize_single_gpu_handle(size_t workSpaceSize, + bool allowTensorOpMathConversion) { return ManagedPerDeviceFFHandle{ - /*num_ranks=*/1, - /*my_rank=*/0, - /*workSpaceSize=*/workSpaceSize, - /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, }; } -ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, - int my_rank, - size_t workSpaceSize, - bool allowTensorOpMathConversion) { +ManagedPerDeviceFFHandle + initialize_multi_gpu_handle(int num_ranks, + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { return ManagedPerDeviceFFHandle{ - /*num_ranks=*/num_ranks, - /*my_rank=*/my_rank, - /*workSpaceSize=*/workSpaceSize, - /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + /*num_ranks=*/num_ranks, + /*my_rank=*/my_rank, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, }; } diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc index be1e3832ff..b98b1745d5 100644 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -41,8 +41,9 @@ TEST_SUITE(FF_TEST_SUITE) { }, cpu_allocator); - GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); TensorShape result_shape = TensorShape{ TensorDims{FFOrdered{3_p}}, diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc index 9e0f38c8d6..51025cd17b 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -1,9 +1,9 @@ #include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/reverse_kernels_cpu.h" -#include "kernels/create_accessor_with_contents.h" -#include #include 
"test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; @@ -36,18 +36,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {3, 3, 6}, - {2, 1, 5}, + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, }, - { - {1, 3, 2}, - {4, 2, 1}, - }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -60,18 +61,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{1_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { - { - {4, 2, 1}, - {1, 3, 2}, - }, + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {2, 1, 5}, - {3, 3, 6}, + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -84,18 +86,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {2, 3, 1}, - {1, 2, 4}, + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, }, - { - {6, 3, 3}, - {5, 1, 2}, - }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -132,18 +135,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { - { - {3, 3, 6}, - {2, 1, 5}, - }, + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {1, 3, 2}, - {4, 2, 1}, + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -156,18 +160,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{1_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {4, 2, 1}, - {1, 3, 2}, + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, }, - { - {2, 1, 5}, - {3, 3, 6}, - }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -180,18 +185,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { - { - {2, 3, 1}, - {1, 2, 4}, - }, + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {6, 3, 3}, - {5, 1, 2}, + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc index 1d08adb56a..a9ba8dea13 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -20,7 +20,6 @@ GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, return read_only_accessor_from_write_accessor(accessor); } - template struct CreateRandomFilledAccessorW { GenericTensorAccessorW operator()(TensorShape const &shape, @@ -82,10 +81,10 @@ struct FillWithZeros { 0, accessor.shape.num_elements().int_from_positive_int() * sizeof(T)); } else { - checkCUDA(cudaMemset(accessor.ptr, - 0, - 
accessor.shape.num_elements().int_from_positive_int() * - sizeof(T))); + checkCUDA(cudaMemset( + accessor.ptr, + 0, + accessor.shape.num_elements().int_from_positive_int() * sizeof(T))); } } }; diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 2f7e908e0b..45e83cc0c6 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -1,8 +1,8 @@ #include "kernels/accessor.h" #include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include -#include "kernels/create_accessor_with_contents.h" using namespace ::FlexFlow; @@ -78,25 +78,25 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns the value if the accessor only contains one value") { GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { - { - {12}, - }, + { + {12}, + }, }, cpu_allocator); float result = accessor_get_only_value(input); - float correct = 12; + float correct = 12; CHECK(result == correct); } - - SUBCASE("throws an error if the requested type does not match the type in the accessor") { + SUBCASE("throws an error if the requested type does not match the type in " + "the accessor") { GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { - { - {12}, - }, + { + {12}, + }, }, cpu_allocator); @@ -106,10 +106,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("throws an error if the accessor contains multiple values") { GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { - { - {12}, - {12}, - }, + { + {12}, + {12}, + }, }, cpu_allocator); diff --git a/lib/kernels/test/src/kernels/array_coord.cc b/lib/kernels/test/src/kernels/array_coord.cc index 128b746a87..bbb503caf1 100644 --- a/lib/kernels/test/src/kernels/array_coord.cc +++ b/lib/kernels/test/src/kernels/array_coord.cc @@ -1,29 +1,32 @@ -#include #include "kernels/array_coord.h" +#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("array_coord_drop_dims") { ArrayCoord coord = ArrayCoord{ - FFOrdered{3_n, 5_n, 0_n, 1_n}, + FFOrdered{3_n, 5_n, 0_n, 1_n}, }; SUBCASE("removes dims specified to be dropped") { - std::function should_drop_dim - = [](ff_dim_t d) { return d.value % 2_n == 0_n; }; + std::function should_drop_dim = [](ff_dim_t d) { + return d.value % 2_n == 0_n; + }; ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); ArrayCoord correct = ArrayCoord{ - FFOrdered{5_n, 1_n}, + FFOrdered{5_n, 1_n}, }; CHECK(result == correct); } - SUBCASE("is identity function if no dimensions are specified to be dropped") { - std::function should_drop_dim - = [](ff_dim_t d) { return false; }; + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return false; + }; ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); ArrayCoord correct = coord; @@ -31,9 +34,11 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("returns empty coord if all dimensions are specified to be dropped") { - std::function should_drop_dim - = [](ff_dim_t d) { return true; }; + SUBCASE( + "returns empty coord if all dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return true; + }; ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); ArrayCoord correct = ArrayCoord{FFOrdered{}}; diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc index 2665cdda36..b3ccbc688c 100644 --- 
a/lib/kernels/test/src/kernels/array_shape.cc +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -38,7 +38,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("array_shape_drop_dims") { ArrayShape input = ArrayShape{ - LegionOrdered{2_p, 4_p, 3_p}, + LegionOrdered{2_p, 4_p, 3_p}, }; SUBCASE("removes dims specified to be dropped") { @@ -48,16 +48,15 @@ TEST_SUITE(FF_TEST_SUITE) { ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = ArrayShape{ - LegionOrdered{4_p}, + LegionOrdered{4_p}, }; CHECK(result == correct); } - SUBCASE("is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return false; - }; + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = input; @@ -65,10 +64,9 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return false; - }; + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = input; @@ -76,10 +74,9 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("returns empty shape if all dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return true; - }; + SUBCASE( + "returns empty shape if all dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { return true; }; ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = ArrayShape{LegionOrdered{}}; diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index 54706ad74e..85ffa91315 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -1,9 +1,9 @@ -#include "internal/test_utils.h" -#include #include "kernels/compare_tensor_accessors.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; @@ -13,41 +13,42 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents( { - { - {1, 3, 2}, - {4, 2, 1}, - }, - { - {3, 3, 6}, - {2, 1, 5}, - }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, }, cpu_allocator); GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents( { - { - {2, 3, 3}, - {5, 1, 0}, - }, - { - {1, 5, 4}, - {2, 1, 5}, - }, + { + {2, 3, 3}, + {5, 1, 0}, + }, + { + {1, 5, 4}, + {2, 1, 5}, + }, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_lt(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_lt(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { - { - {true, false, true}, - {true, false, false}, - }, - { - {false, true, false}, - {false, false, false}, - }, + { + {true, false, true}, + {true, false, false}, + }, + { + {false, true, false}, + {false, false, false}, + }, }, cpu_allocator); @@ -60,35 +61,36 @@ 
TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents( { - { - {4, 2, 1}, - }, - { - {2, 1, 5}, - }, + { + {4, 2, 1}, + }, + { + {2, 1, 5}, + }, }, cpu_allocator); GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents( { - { - {5, 1, 0}, - }, - { - {2, 1, 5}, - }, + { + {5, 1, 0}, + }, + { + {2, 1, 5}, + }, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_le(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_le(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { - { - {true, false, false}, - }, - { - {true, true, true}, - }, + { + {true, false, false}, + }, + { + {true, true, true}, + }, }, cpu_allocator); @@ -101,23 +103,24 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2, 1}, - {2, 1, 5}, + {4, 2, 1}, + {2, 1, 5}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 1, 0}, - {2, 1, 5}, + {5, 1, 0}, + {2, 1, 5}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_gt(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_gt(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {false, true, true}, - {false, false, false}, + {false, true, true}, + {false, false, false}, }, cpu_allocator); @@ -130,26 +133,27 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2}, - {2, 5}, - {1, 8}, + {4, 2}, + {2, 5}, + {1, 8}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 1}, - {3, 6}, - {1, 0}, + {5, 1}, + {3, 6}, + {1, 0}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_ge(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_ge(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {false, true}, - {false, false}, - {true, true}, + {false, true}, + {false, false}, + {true, true}, }, cpu_allocator); @@ -162,23 +166,24 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2}, - {1, 8}, + {4, 2}, + {1, 8}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 2}, - {1, 8}, + {5, 2}, + {1, 8}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_eq(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_eq(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {false, true}, - {true, true}, + {false, true}, + {true, true}, }, cpu_allocator); @@ -191,26 +196,27 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2}, - {1, 8}, - {1, 2}, + {4, 2}, + {1, 8}, + {1, 2}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 2}, - {1, 8}, - {2, 2}, + {5, 2}, + {1, 8}, + {2, 2}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_ne(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_ne(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {true, false}, - {false, false}, - {true, false}, + {true, false}, + {false, false}, + {true, false}, }, cpu_allocator); diff --git 
a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc index a6cfdbc97f..69fa2728bf 100644 --- a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc +++ b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc @@ -1,5 +1,5 @@ -#include #include "kernels/create_accessor_with_contents.h" +#include using namespace ::FlexFlow; @@ -7,8 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_1d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); + GenericTensorAccessorW result = + create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); auto at = [&](nonnegative_int c) -> float { return result.at(FFOrdered{c}); @@ -23,13 +23,12 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_2d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_2d_accessor_w_with_contents( - { - {1, 4, 2}, - {2, 2, 7}, - }, - cpu_allocator); + GenericTensorAccessorW result = create_2d_accessor_w_with_contents( + { + {1, 4, 2}, + {2, 2, 7}, + }, + cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{r, c}); @@ -46,23 +45,23 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_3d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_3d_accessor_w_with_contents( - { + GenericTensorAccessorW result = create_3d_accessor_w_with_contents( + { { - {1, 4}, - {2, 3}, - {7, 2}, + {1, 4}, + {2, 3}, + {7, 2}, }, { - {9, 3}, - {4, 5}, - {0, 2}, + {9, 3}, + {4, 5}, + {0, 2}, }, - }, - cpu_allocator); + }, + cpu_allocator); - auto at = [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { + auto at = + [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{s, r, c}); }; @@ -83,33 +82,35 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_4d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_4d_accessor_w_with_contents( - { + GenericTensorAccessorW result = create_4d_accessor_w_with_contents( + { { - { - {2, 3}, - {7, 2}, - }, - { - {4, 5}, - {0, 2}, - }, + { + {2, 3}, + {7, 2}, + }, + { + {4, 5}, + {0, 2}, + }, }, { - { - {9, 6}, - {1, 2}, - }, - { - {8, 7}, - {3, 8}, - }, + { + {9, 6}, + {1, 2}, + }, + { + {8, 7}, + {3, 8}, + }, }, - }, - cpu_allocator); + }, + cpu_allocator); - auto at = [&](nonnegative_int s1, nonnegative_int s2, nonnegative_int r, nonnegative_int c) -> float { + auto at = [&](nonnegative_int s1, + nonnegative_int s2, + nonnegative_int r, + nonnegative_int c) -> float { return result.at(FFOrdered{s1, s2, r, c}); }; diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc index a7f2bed5ba..f515f2495b 100644 --- a/lib/kernels/test/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -1,7 +1,7 @@ #include "kernels/format_accessor_contents.h" #include "internal/test_utils.h" -#include "kernels/local_cpu_allocator.h" #include "kernels/create_accessor_with_contents.h" +#include "kernels/local_cpu_allocator.h" #include using namespace ::FlexFlow; @@ -12,7 +12,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("accessor is 1d") { 
GenericTensorAccessorR accessor = - create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + create_1d_accessor_r_with_contents({1, 2, 3, 2}, + cpu_allocator); std::string correct = "[1 2 3 2]"; @@ -22,13 +23,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("accessor is 2d") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {1, 2, 3, 5}, - {4, 3, 3, 2}, - {1, 1, 5, 8}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); std::string correct = "[\n" " [1 2 3 5]\n" @@ -42,25 +44,26 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("accessor is 3d") { - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {1, 2, 3, 6}, - {4, 3, 3, 9}, - {1, 1, 5, 1}, - }, - { - {4, 1, 8, 7}, - {9, 4, 2, 4}, - {1, 0, 0, 6}, - }, + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( { - {2, 1, 1, 9}, - {1, 3, 6, 2}, - {1, 9, 8, 9}, + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, }, - }, - cpu_allocator); + cpu_allocator); std::string correct = "[\n" " [\n" diff --git a/lib/kernels/test/src/kernels/map_tensor_accessors.cc b/lib/kernels/test/src/kernels/map_tensor_accessors.cc index fcc59b7935..60d7c76904 100644 --- a/lib/kernels/test/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/map_tensor_accessors.cc @@ -1,6 +1,6 @@ -#include #include "kernels/map_tensor_accessors.h" #include "kernels/create_accessor_with_contents.h" +#include using namespace ::FlexFlow; @@ -10,8 +10,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW accessor = create_2d_accessor_w_with_contents( { - {1, 3, 2}, - {2, 1, 5}, + {1, 3, 2}, + {2, 1, 5}, }, cpu_allocator); @@ -28,19 +28,20 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(at(1_n, 1_n) == 2); CHECK(at(1_n, 2_n) == 6); } - + TEST_CASE("map_tensor_accessor") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW input = create_2d_accessor_w_with_contents( { - {1, 3, 2}, - {2, 1, 5}, + {1, 3, 2}, + {2, 1, 5}, }, cpu_allocator); SUBCASE("function is not type changing") { - GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) { return x + 1; }, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessor( + input, [](float x) { return x + 1; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{r, c}); @@ -55,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("function is type changing") { - GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) -> bool { return x > 2; }, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessor( + input, [](float x) -> bool { return x > 2; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { return result.at(FFOrdered{r, c}); @@ -75,21 +77,26 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW lhs = create_2d_accessor_w_with_contents( { - {1, 3, 2}, - {2, 1, 5}, + {1, 3, 2}, + {2, 1, 5}, }, cpu_allocator); SUBCASE("argument types are the same") { GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( { - {0, 2, 5}, - {3, 3, 8}, + {0, 2, 5}, + {3, 3, 8}, }, cpu_allocator); SUBCASE("function is not type changing") { - GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::FLOAT, [](float l, float r) { return l + 2 * r; }, 
cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessors2( + lhs, + rhs, + DataType::FLOAT, + [](float l, float r) { return l + 2 * r; }, + cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{r, c}); @@ -104,7 +111,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("function is type changing") { - GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::BOOL, [](float l, float r) -> bool { return l > r; }, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](float l, float r) -> bool { return l > r; }, + cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { return result.at(FFOrdered{r, c}); @@ -122,19 +134,20 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("argument types are not the same") { GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( { - {true, false, true}, - {true, false, false}, + {true, false, true}, + {true, false, false}, }, cpu_allocator); auto func = [](float l, bool r) -> double { if (r) { - return (- l); + return (-l); } else { return l * 2; } }; - GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::DOUBLE, func, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessors2( + lhs, rhs, DataType::DOUBLE, func, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> double { return result.at(FFOrdered{r, c}); diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc index 0e69b3b937..a269cf4777 100644 --- a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -1,9 +1,9 @@ -#include #include "kernels/reduce_tensor_accessor.h" #include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" -#include "kernels/create_accessor_with_contents.h" +#include using namespace ::FlexFlow; @@ -11,56 +11,58 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("reduce_tensor_accessor_in_dims") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {1, 3, 2}, - {2, 1, 5}, - }, - { - {4, 2, 1}, - {8, 3, 6}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); GenericTensorAccessorW result = reduce_tensor_accessor_in_dims( - accessor, - {ff_dim_t{0_n}, ff_dim_t{2_n}}, - cpu_allocator, - [](int32_t accum, int32_t x) { return x + accum; }); + accessor, + {ff_dim_t{0_n}, ff_dim_t{2_n}}, + cpu_allocator, + [](int32_t accum, int32_t x) { return x + accum; }); - GenericTensorAccessorW correct = create_1d_accessor_w_with_contents( - { - 1 + 3 + 2 + 4 + 2 + 1, - 2 + 1 + 5 + 8 + 3 + 6, - }, - cpu_allocator); + GenericTensorAccessorW correct = + create_1d_accessor_w_with_contents( + { + 1 + 3 + 2 + 4 + 2 + 1, + 2 + 1 + 5 + 8 + 3 + 6, + }, + cpu_allocator); CHECK_MESSAGE(accessors_are_equal(result, correct), check_kv("result =", format_accessor_w_contents(result)), check_kv("correct=", format_accessor_w_contents(correct))); } - TEST_CASE("reduce_tensor_accessor_in_all_dims") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {1, 3, 2}, - {2, 
1, 5}, - }, - { - {4, 2, 1}, - {8, 3, 6}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); int32_t result = reduce_tensor_accessor_in_all_dims( - accessor, [](int32_t accum, int32_t elem) { return accum + elem; }); + accessor, [](int32_t accum, int32_t elem) { return accum + elem; }); int32_t correct = 1 + 3 + 2 + 2 + 1 + 5 + 4 + 2 + 1 + 8 + 3 + 6; CHECK(result == correct); diff --git a/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc index 744b875ee7..46f746161f 100644 --- a/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc +++ b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc @@ -1,7 +1,7 @@ -#include +#include "kernels/tensor_accessor_reductions.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" -#include "kernels/tensor_accessor_reductions.h" +#include using namespace ::FlexFlow; @@ -10,18 +10,19 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("returns false if any elements are false") { - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {true, true, true}, - {true, true, true}, - }, - { - {true, false, true}, - {true, true, true}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {true, true, true}, + {true, true, true}, + }, + { + {true, false, true}, + {true, true, true}, + }, + }, + cpu_allocator); bool result = tensor_accessor_all(accessor); bool correct = false; @@ -30,12 +31,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("returns true if all elements are true") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {true, true, true}, - {true, true, true}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {true, true, true}, + {true, true, true}, + }, + cpu_allocator); bool result = tensor_accessor_all(accessor); bool correct = true; @@ -44,12 +46,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("throw an error if the datatype is not bool") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {1, 0, 1}, - {1, 1, 1}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); CHECK_THROWS(tensor_accessor_all(accessor)); } @@ -59,18 +62,19 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("returns true if any elements are true") { - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {false, false, false}, - {true, false, false}, - }, - { - {false, false, false}, - {false, false, false}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {false, false, false}, + {true, false, false}, + }, + { + {false, false, false}, + {false, false, false}, + }, + }, + cpu_allocator); bool result = tensor_accessor_any(accessor); bool correct = true; @@ -79,12 +83,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("returns false if all elements are false") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {false, false, false}, - {false, false, false}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + 
create_2d_accessor_r_with_contents( + { + {false, false, false}, + {false, false, false}, + }, + cpu_allocator); bool result = tensor_accessor_any(accessor); bool correct = false; @@ -93,12 +98,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("throw an error if the datatype is not bool") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {1, 0, 1}, - {1, 1, 1}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); CHECK_THROWS(tensor_accessor_any(accessor)); } diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 3b024fdf55..f80c080f11 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -20,9 +20,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 4ca8811b9b..dd98a36094 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -16,9 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 00a26c3303..534901daf2 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 6ce415d48c..f3a2a8153d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Combine Forward and Backward Kernels") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index b22add8905..397b5cdf90 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { ManagedPerDeviceFFHandle managed_handle = 
initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 1b224084f8..c4518293dd 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -21,9 +21,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 98896cca18..14930e280b 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -9,9 +9,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; TensorShape input_shape = TensorShape{ diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 52389ea0f5..365fd3fb81 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 4f3b701bba..3e63294e78 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -23,9 +23,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 099536ce0d..ed2d8dc2b6 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedFFStream") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc 
b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index 058622e5cb..bfe3c363e4 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -6,10 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedPerDeviceFFHandle") { ManagedPerDeviceFFHandle base_handle{ - /*num_ranks=*/1, - /*my_rank=*/0, - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true, + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, }; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); @@ -26,10 +26,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("move assignment operator") { SUBCASE("move assign to other") { ManagedPerDeviceFFHandle new_handle{ - /*num_ranks=*/1, - /*my_rank=*/0, - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true, + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, }; new_handle = std::move(base_handle); CHECK(&new_handle.raw_handle() == base_handle_ptr); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 94ce8f4848..40a9eead53 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -8,9 +8,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 7691daf7a6..a999311b81 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -24,31 +24,30 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - Pool2DPerDeviceState state = - Kernels::Pool2D::init_kernel(/*handle=*/managed_handle.raw_handle(), - /*activation=*/std::nullopt, - /*input_w=*/input_w.int_from_positive_int(), - /*input_h=*/input_h.int_from_positive_int(), - /*input_c=*/input_c.int_from_positive_int(), - /*input_n=*/input_n.int_from_positive_int(), - /*output_w=*/output_w.int_from_positive_int(), - /*output_h=*/output_h.int_from_positive_int(), - /*output_c=*/output_c.int_from_positive_int(), - /*output_n=*/output_n.int_from_positive_int(), - /*pad_h=*/pad_h.unwrap_nonnegative(), - /*pad_w=*/pad_w.unwrap_nonnegative(), - /*kernel_h=*/kernel_h.int_from_positive_int(), - /*kernel_w=*/kernel_w.int_from_positive_int(), - /*stride_h=*/stride_h.int_from_positive_int(), - /*stride_w=*/stride_w.int_from_positive_int(), - /*pool_type=*/pool_type); + Pool2DPerDeviceState state = Kernels::Pool2D::init_kernel( + /*handle=*/managed_handle.raw_handle(), + /*activation=*/std::nullopt, + /*input_w=*/input_w.int_from_positive_int(), + /*input_h=*/input_h.int_from_positive_int(), + 
/*input_c=*/input_c.int_from_positive_int(), + /*input_n=*/input_n.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_n=*/output_n.int_from_positive_int(), + /*pad_h=*/pad_h.unwrap_nonnegative(), + /*pad_w=*/pad_w.unwrap_nonnegative(), + /*kernel_h=*/kernel_h.int_from_positive_int(), + /*kernel_w=*/kernel_w.int_from_positive_int(), + /*stride_h=*/stride_h.int_from_positive_int(), + /*stride_w=*/stride_w.int_from_positive_int(), + /*pool_type=*/pool_type); TensorShape input_shape = TensorShape{ TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 16b03d34d9..e2c4c36a71 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 95989776c1..5f58239a31 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -22,9 +22,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); @@ -47,16 +46,18 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents( - { - {1, 2, 3}, - {4, 3, 3}, - {1, 3, 5}, - }, - gpu_allocator); - - GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + GenericTensorAccessorR output_grad = + create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + gpu_allocator); + + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); GenericTensorAccessorW input_grad = gpu_allocator.allocate_tensor(input_shape); @@ -85,9 +86,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 8c851e877e..066db28a17 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -6,9 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, 
+ /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index b9f97bc5cd..6a0ad84a92 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -61,9 +60,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index dc8cb276ab..bf10b5c633 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -13,9 +13,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { nonnegative_int channels = 100_n; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d51d0e40f5..1c1c4d4d51 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { coord_t num_blks = 1; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 06b5add3c7..8560d33e5b 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -13,9 +13,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index d95545d1cc..184bf0b559 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define 
_FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "task-spec/task_argument_accessor.h" #include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_argument_accessor.h" #include #include diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index d625088be4..c75d4414de 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,10 +16,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" #include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 7b08036059..e4a9c78743 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include "task-spec/task_impl_function.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index d400b4f815..ffaeaf285f 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -35,7 +35,8 @@ bool are_allocated_forward_tensors_valid( if (!is_allocated_tensor_backing_valid( TensorTypeVariant{tensor_guid}, allocated_tensors.tensor_type_backings, - array_shape_from_tensor_shape(tensor_attrs.at(tensor_guid).shape))) { + array_shape_from_tensor_shape( + tensor_attrs.at(tensor_guid).shape))) { return false; } } else { @@ -58,8 +59,8 @@ bool are_allocated_gradient_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_grad.first).shape); + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_grad.first).shape); TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -100,8 +101,8 @@ bool are_allocated_optimizer_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_optimizers.first).shape); + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_optimizers.first).shape); for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 4b5ee0b782..3b1bb0fd2d 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,12 +1,12 @@ #include "local-execution/local_training_backing.h" 
#include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include "task-spec/task_signature_impl.h" #include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "task-spec/op_task_to_task_invocation.h" #include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 974e580b8e..c23159a85d 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -55,8 +55,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = - logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); + int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); @@ -70,29 +69,26 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = - logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + int num_classes = logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) .int_from_positive_int() / label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .int_from_positive_int(); // TODO FIXME something seems wrong here, - // isn't the numerator guaranteed to be 1? - // <--- this is not the case because of the - // potential parallel dim + .int_from_positive_int(); // TODO FIXME something seems wrong + // here, isn't the numerator guaranteed + // to be 1? 
+ // <--- this is not the case because of + // the potential parallel dim } - ASSERT( - label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); + ASSERT(label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) .int_from_positive_int() == logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) .int_from_positive_int()); - ASSERT( - label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == - 1); + ASSERT(label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 0acc3d865d..ae3d97daa4 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,6 +1,6 @@ #include "local-execution/task_registry.h" -#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph.h" +#include "task-spec/task_signature_impl.h" namespace FlexFlow { diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 971b09356c..3242ca79ad 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" @@ -29,16 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 2494ff1943..8827e0269d 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -18,13 +18,12 @@ using namespace ::FlexFlow; -bool did_loss_decrease( - GenericTensorAccessorR const &first_epoch, - GenericTensorAccessorR const &last_epoch -) { +bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - return tensor_accessor_all(compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } TEST_SUITE(FF_CUDA_TEST_SUITE) { @@ -32,9 +31,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - 
/*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -48,32 +46,28 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int output_dim = 1_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - }, - /*gradient_mapping=*/{}, - /*optimizer_mapping*/{}, + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, }; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); @@ -162,16 +156,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - loss_values.push_back( - copy_tensor_accessor_r( - model_training_instance.get_loss_tensor_accessor(), - cpu_allocator)); + loss_values.push_back(copy_tensor_accessor_r( + model_training_instance.get_loss_tensor_accessor(), cpu_allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch GenericTensorAccessorR first_epoch_loss = loss_values.at(0); GenericTensorAccessorR last_epoch = loss_values.back(); - CHECK(did_loss_decrease( first_epoch_loss, last_epoch)); + CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 71148d06c1..42b88aa6bc 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,9 +12,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index e817b6fd8e..5c11010e2a 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -18,8 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{ - 
FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index df787fcd6f..bba0bd28ce 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -94,12 +94,10 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_source.new_mock_tensor_guid(); TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW allocated_tensor_backing = diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index 5a9347e37b..d741d4d8d4 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -18,9 +18,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,11 +35,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int output_dim = 32_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, - DataType::FLOAT}; + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = allocator.allocate_tensor(output_tensor_shape); @@ -58,12 +55,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index ea20eb0fa0..4bcfa7fe17 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,8 +1,8 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph_builder.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h" diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 7a2650b447..0a0b99e61c 100644 --- 
a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.h" @@ -38,16 +38,13 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source.new_optimizer_tensor(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 6ffe002f22..54c64e6b6c 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -16,9 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); @@ -31,12 +30,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int output_dim = 32_p; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); diff --git a/lib/models/src/models/dlrm/dlrm.cc b/lib/models/src/models/dlrm/dlrm.cc index 5d56909fec..d1dd52b4da 100644 --- a/lib/models/src/models/dlrm/dlrm.cc +++ b/lib/models/src/models/dlrm/dlrm.cc @@ -143,17 +143,17 @@ ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { /*input=*/dense_input, /*mlp_layers=*/config.dense_arch_layer_sizes); - std::vector emb_outputs = transform( - zip(config.embedding_size, sparse_inputs), - [&](std::pair const &combined_pair) - -> tensor_guid_t { - return create_dlrm_sparse_embedding_network( - /*cgb=*/cgb, - /*config=*/config, - /*input=*/combined_pair.second, - /*input_dim=*/combined_pair.first, - /*output_dim=*/config.embedding_dim); - }); + std::vector emb_outputs = + transform(zip(config.embedding_size, sparse_inputs), + [&](std::pair const &combined_pair) + -> tensor_guid_t { + return create_dlrm_sparse_embedding_network( + /*cgb=*/cgb, + /*config=*/config, + /*input=*/combined_pair.second, + /*input_dim=*/combined_pair.first, + /*output_dim=*/config.embedding_dim); + }); tensor_guid_t interacted_features = 
create_dlrm_interact_features(
       /*cgb=*/cgb,
diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h
index 62f7ccd4f9..ad45dcb13c 100644
--- a/lib/op-attrs/include/op-attrs/datatype.h
+++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -13,58 +13,53 @@ template <DataType>
 struct data_type_enum_to_class;
 
 template <>
-struct data_type_enum_to_class<DataType::FLOAT>
-    : type_identity<float> {};
+struct data_type_enum_to_class<DataType::FLOAT> : type_identity<float> {};
 
 template <>
-struct data_type_enum_to_class<DataType::DOUBLE>
-    : type_identity<double> {};
+struct data_type_enum_to_class<DataType::DOUBLE> : type_identity<double> {};
 
 template <>
-struct data_type_enum_to_class<DataType::INT32>
-    : type_identity<int32_t> {};
+struct data_type_enum_to_class<DataType::INT32> : type_identity<int32_t> {};
 
 template <>
-struct data_type_enum_to_class<DataType::INT64>
-    : type_identity<int64_t> {};
+struct data_type_enum_to_class<DataType::INT64> : type_identity<int64_t> {};
 
 template <>
-struct data_type_enum_to_class<DataType::HALF>
-    : type_identity<half> {};
+struct data_type_enum_to_class<DataType::HALF> : type_identity<half> {};
 
 template <>
-struct data_type_enum_to_class<DataType::BOOL>
-    : type_identity<bool> {};
+struct data_type_enum_to_class<DataType::BOOL> : type_identity<bool> {};
 
 template <typename T>
 struct type_to_data_type_enum;
 
 template <>
 struct type_to_data_type_enum<float>
-  : std::integral_constant<DataType, DataType::FLOAT> {};
+    : std::integral_constant<DataType, DataType::FLOAT> {};
 
 template <>
 struct type_to_data_type_enum<double>
-  : std::integral_constant<DataType, DataType::DOUBLE> {};
+    : std::integral_constant<DataType, DataType::DOUBLE> {};
 
 template <>
 struct type_to_data_type_enum<int32_t>
-  : std::integral_constant<DataType, DataType::INT32> {};
+    : std::integral_constant<DataType, DataType::INT32> {};
 
 template <>
 struct type_to_data_type_enum<int64_t>
-  : std::integral_constant<DataType, DataType::INT64> {};
+    : std::integral_constant<DataType, DataType::INT64> {};
 
 template <>
 struct type_to_data_type_enum<half>
-  : std::integral_constant<DataType, DataType::HALF> {};
+    : std::integral_constant<DataType, DataType::HALF> {};
 
 template <>
 struct type_to_data_type_enum<bool>
-  : std::integral_constant<DataType, DataType::BOOL> {};
+    : std::integral_constant<DataType, DataType::BOOL> {};
 
 template <typename T>
-inline constexpr DataType type_to_data_type_enum_v = type_to_data_type_enum<T>::value;
+inline constexpr DataType type_to_data_type_enum_v =
+    type_to_data_type_enum<T>::value;
 
 template <DataType DT, typename T>
 typename data_type_enum_to_class<DT>
::type cast_to(T t) { diff --git a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h index f619f94e20..c5c967d5c2 100644 --- a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h +++ b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h @@ -14,7 +14,7 @@ namespace FlexFlow { * https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 */ positive_int calculate_fan_for_mode(TensorDims const &dims, - KaimingInitializerMode mode); + KaimingInitializerMode mode); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index bb374d98ee..435a962963 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -18,11 +18,11 @@ nonnegative_int num_shard_dims(ParallelTensorDims const &); ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &); ParallelTensorDims lift_to_parallel(TensorDims const &); -ParallelTensorDims lift_to_parallel_with_degrees( - TensorDims const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorDims + lift_to_parallel_with_degrees(TensorDims const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorDims lift_to_parallel_with_degrees(TensorDims const &, ParallelTensorDimDegrees const &); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index 96d9bfb06a..e366f99b8e 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -17,8 +17,7 @@ ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t); -FFOrdered - ff_ordered_shard_degrees(ParallelTensorShape const &); +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &); std::optional try_get_shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); @@ -26,11 +25,11 @@ std::optional ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorShape const &); ParallelTensorShape lift_to_parallel(TensorShape const &); -ParallelTensorShape lift_to_parallel_with_degrees( - TensorShape const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorShape + lift_to_parallel_with_degrees(TensorShape const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorShape lift_to_parallel_with_degrees(TensorShape const &, ParallelTensorDimDegrees const &); diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h index 85cea57523..28c48620a9 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h @@ -9,7 +9,7 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set(); positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &, - ReplicaType); + ReplicaType); std::unordered_set get_replica_dims(ReplicaParallelDimSet const &); diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc index dfb77dac5d..a4abde2cb4 100644 --- 
a/lib/op-attrs/src/op-attrs/datatype_value.cc
+++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -24,12 +24,12 @@ DataTypeValue make_bool_data_type_value(bool value) {
 }
 
 DataType get_data_type_of_data_type_value(DataTypeValue value) {
-  return value.visit(overload {
-    [](float) { return DataType::FLOAT; },
-    [](double) { return DataType::DOUBLE; },
-    [](int32_t) { return DataType::INT32; },
-    [](int64_t) { return DataType::INT64; },
-    [](bool) { return DataType::BOOL; },
+  return value.visit(overload{
+      [](float) { return DataType::FLOAT; },
+      [](double) { return DataType::DOUBLE; },
+      [](int32_t) { return DataType::INT32; },
+      [](int64_t) { return DataType::INT64; },
+      [](bool) { return DataType::BOOL; },
   });
 }
 
diff --git a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc
index 789903dc66..aee2256036 100644
--- a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc
+++ b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc
@@ -4,7 +4,7 @@ namespace FlexFlow {
 
 positive_int calculate_fan_for_mode(TensorDims const &dims,
-                                     KaimingInitializerMode mode) {
+                                    KaimingInitializerMode mode) {
   positive_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0});
   positive_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1});
 
diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc
index c5678e7bde..5800f086ef 100644
--- a/lib/op-attrs/src/op-attrs/ops/attention.cc
+++ b/lib/op-attrs/src/op-attrs/ops/attention.cc
@@ -69,8 +69,7 @@ positive_int get_vSize(MultiHeadAttentionInputs const &inputs) {
   return inputs.value_size;
 }
 
-positive_int
-    get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
+positive_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
   return inputs.sequence_dim.size;
 }
 
@@ -78,8 +77,7 @@ positive_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) {
   return inputs.sequence_length;
 }
 
-positive_int
-    get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
+positive_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
   return inputs.sequence_dim.size; // FIXME -- assumes only prefill
 }
 
@@ -87,8 +85,7 @@ positive_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) {
   return inputs.sequence_length; // FIXME -- assumes only prefill
 }
 
-positive_int
-    get_num_samples(MultiHeadAttentionParallelInputs const &inputs) {
+positive_int get_num_samples(MultiHeadAttentionParallelInputs const &inputs) {
   return inputs.batch_dim.size;
 }
 
diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc
index d11a8aba10..33c4987233 100644
--- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc
+++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc
@@ -152,10 +152,9 @@ tl::expected<ParallelTensorShape, std::string>
   ShardParallelDim output_p = p;
   positive_int output_discard_copy_degree = 1_p;
 
-  positive_int output_sum_degree = positive_int{
-    get_total_parallel_degree(input_lhs) /
-    (output_b.degree * output_n.degree * output_p.degree)
-  };
+  positive_int output_sum_degree =
+      positive_int{get_total_parallel_degree(input_lhs) /
+                   (output_b.degree * output_n.degree * output_p.degree)};
 
   ParallelTensorShape result = ParallelTensorShape{
       ParallelTensorDims{
diff --git a/lib/op-attrs/src/op-attrs/ops/combine.cc b/lib/op-attrs/src/op-attrs/ops/combine.cc
index c55bdc55bb..64e9316ea2 100644
--- a/lib/op-attrs/src/op-attrs/ops/combine.cc
+++
b/lib/op-attrs/src/op-attrs/ops/combine.cc @@ -44,10 +44,10 @@ tl::expected } ParallelTensorShape output = input; - relative_ff_dim_t combine_dim = relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); + relative_ff_dim_t combine_dim = + relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); shard_dim_at_idx(output, combine_dim).degree = positive_int{ - shard_dim_at_idx(output, combine_dim).degree / attrs.combine_degree - }; + shard_dim_at_idx(output, combine_dim).degree / attrs.combine_degree}; return output; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index b41d1ffc32..aed118dd62 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -17,8 +17,7 @@ tl::expected get_output_shape(ConcatAttrs const &attrs, std::vector const &inputs) { auto get_non_axis_dims = [&](TensorShape const &s) { - std::map dim_sizes = - enumerate(ff_ordered(s.dims)); + std::map dim_sizes = enumerate(ff_ordered(s.dims)); dim_sizes.erase(attrs.axis); return dim_sizes; }; diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index af4b6cd898..2ac90c1c9c 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -51,9 +51,9 @@ TensorShape get_bias_shape(Conv2DAttrs const &attrs, } static positive_int calculate_output_size(positive_int input_size, - nonnegative_int padding_size, - positive_int kernel_size, - positive_int stride) { + nonnegative_int padding_size, + positive_int kernel_size, + positive_int stride) { int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); int kernel_size_raw = kernel_size.int_from_positive_int(); diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index 578e9ce652..32791e81a9 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -209,8 +209,8 @@ tl::expected, std::string> get_initializers( InitializerAttrs projection_initializer = maybe_projection_initializer.value_or(projection_default_initializer); - positive_int fan_in = calculate_fan_for_mode( - projection_shape.dims, KaimingInitializerMode::FAN_IN); + positive_int fan_in = calculate_fan_for_mode(projection_shape.dims, + KaimingInitializerMode::FAN_IN); float bound = 1 / sqrtf(static_cast(fan_in.int_from_positive_int())); diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index c542d688b3..361216cce4 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -105,9 +105,9 @@ tl::expected } static positive_int calculate_output_size(positive_int input_size, - nonnegative_int padding_size, - positive_int kernel_size, - positive_int stride) { + nonnegative_int padding_size, + positive_int kernel_size, + positive_int stride) { int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); int kernel_size_raw = kernel_size.int_from_positive_int(); diff --git a/lib/op-attrs/src/op-attrs/ops/reduction.cc b/lib/op-attrs/src/op-attrs/ops/reduction.cc index 007559a816..580d47b1e9 100644 --- a/lib/op-attrs/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/src/op-attrs/ops/reduction.cc @@ -29,10 +29,9 @@ tl::expected } ParallelTensorShape output_shape = input_shape; - + output_shape.dims.replica_dims.sum_degree.value = positive_int{ - output_shape.dims.replica_dims.sum_degree.value / 
attrs.reduction_degree - }; + output_shape.dims.replica_dims.sum_degree.value / attrs.reduction_degree}; return output_shape; } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 8a96bc25ba..dd5230f5a4 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -19,8 +19,7 @@ FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &d) { return d.shard_dims; } -FFOrdered - ff_ordered_shard_degrees(ParallelTensorDims const &d) { +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &d) { return transform(d.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index ff6debee4f..1b8f6f1dfa 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -52,8 +52,7 @@ ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, return shard_dim_at_idx(s.dims, d); } -FFOrdered - ff_ordered_shard_degrees(ParallelTensorShape const &s) { +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { return ff_ordered_shard_degrees(s.dims); } @@ -133,8 +132,8 @@ ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, [&](ReplicaType replica_type) { ReplicaParallelDimSet replicas = shape.dims.replica_dims; positive_int degree = (ReplicaType::SUM == replica_type - ? replicas.sum_degree.value - : replicas.discard_copy_degree.value); + ? replicas.sum_degree.value + : replicas.discard_copy_degree.value); return ParallelDim{ReplicaParallelDim{degree, replica_type}}; }}); } diff --git a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc index 41fb988bf7..871a39f91f 100644 --- a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc +++ b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc @@ -8,7 +8,7 @@ ReplicaParallelDimSet empty_replica_parallel_dim_set() { } positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, - ReplicaType replica_type) { + ReplicaType replica_type) { switch (replica_type) { case ReplicaType::SUM: return s.sum_degree.value; diff --git a/lib/op-attrs/test/src/op-attrs/ops/attention.cc b/lib/op-attrs/test/src/op-attrs/ops/attention.cc index a99fe167c7..a4f8cd62fd 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/attention.cc @@ -188,10 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_q) { return lift_to_parallel_with_degrees( - input_q, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_q}); + input_q, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_q}); }; auto make_k = [&](SumDegree o_sum, @@ -200,10 +197,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_k) { return lift_to_parallel_with_degrees( - input_k, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_k}); + input_k, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_k}); }; auto make_v = [&](SumDegree o_sum, @@ -212,10 +206,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_v) { return lift_to_parallel_with_degrees( - input_v, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_v}); + input_v, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_v}); }; auto make_o = [&](SumDegree o_sum, @@ -224,10 +215,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_o) { return 
lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_o}); + output, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_o}); }; auto make_w = [&](SumDegree o_sum, @@ -242,20 +230,14 @@ TEST_SUITE(FF_TEST_SUITE) { DiscardCopyDegree o_eq, positive_int o_in_proj_channel) { return lift_to_parallel_with_degrees( - input_bias, - o_sum, - o_eq, - FFOrdered{o_in_proj_channel}); + input_bias, o_sum, o_eq, FFOrdered{o_in_proj_channel}); }; auto make_output_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, positive_int o_out_proj_channel) { return lift_to_parallel_with_degrees( - output_bias, - o_sum, - o_eq, - FFOrdered{o_out_proj_channel}); + output_bias, o_sum, o_eq, FFOrdered{o_out_proj_channel}); }; SUBCASE("data parallelism") { diff --git a/lib/op-attrs/test/src/op-attrs/ops/cast.cc b/lib/op-attrs/test/src/op-attrs/ops/cast.cc index eeba779dfe..128d077a05 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/cast.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/cast.cc @@ -37,10 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_batch, positive_int o_features) { return lift_to_parallel_with_degrees( - input, - o_sum, - o_eq, - FFOrdered{o_batch, o_features}); + input, o_sum, o_eq, FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, @@ -48,10 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_batch, positive_int o_outchannels) { return lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_outchannels}); + output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); }; SumDegree sum_degree = SumDegree{2_p}; diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index 07520e7cce..d8844d9b30 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -37,7 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; positive_int old_shard_degree = output.dims.shard_dims.at(dim).degree; - output.dims.shard_dims.at(dim).degree = positive_int{old_shard_degree / degree}; + output.dims.shard_dims.at(dim).degree = + positive_int{old_shard_degree / degree}; return output; }(); diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index ee1255161c..95fa7d67c7 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -145,8 +145,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{ - dim0_size, 14_p + 16_p + 18_p, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 14_p + 16_p + 18_p, dim2_size}}, DataType::FLOAT, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc index 67b6bbadb8..56407c03f1 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc @@ -165,8 +165,7 @@ TEST_SUITE(FF_TEST_SUITE) { kernel, o_sum, o_eq, - FFOrdered{ - o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); + FFOrdered{o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); }; auto make_bias = [&](SumDegree o_sum, diff --git a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc index 7d43b45dd0..e7cc2d6420 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc @@ -77,10 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_batch, positive_int o_outchannels) { 
return lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_outchannels}); + output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); }; auto make_weights = [&](SumDegree o_sum, @@ -88,10 +85,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_entries, positive_int o_outchannels) { return lift_to_parallel_with_degrees( - weights, - o_sum, - o_eq, - FFOrdered{o_entries, o_outchannels}); + weights, o_sum, o_eq, FFOrdered{o_entries, o_outchannels}); }; SUBCASE("data parallelism") { diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc index 1ca936738b..61934fd1fe 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc @@ -131,10 +131,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_extra_dim, positive_int o_channel) { return lift_to_parallel_with_degrees( - input, - o_sum, - o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + input, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_output = [&](SumDegree o_sum, @@ -143,10 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_extra_dim, positive_int o_channel) { return lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + output, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_projection = [&](SumDegree o_sum, @@ -154,10 +148,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_inchannel, positive_int o_outchannel) { return lift_to_parallel_with_degrees( - projection, - o_sum, - o_eq, - FFOrdered{o_inchannel, o_outchannel}); + projection, o_sum, o_eq, FFOrdered{o_inchannel, o_outchannel}); }; auto make_bias = [&](SumDegree o_sum, diff --git a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc index 9a27aafa5b..fcb772d187 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc @@ -16,8 +16,8 @@ TEST_SUITE(FF_TEST_SUITE) { Activation activation = Activation::RELU; PoolOp op = PoolOp::AVG; - TensorDims input_dims = TensorDims{ - FFOrdered{input_n, input_c, input_h, input_w}}; + TensorDims input_dims = + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}; SUBCASE("input_h divisible by output_h && input_w divisible by output_w") { positive_int output_h = 5_p; diff --git a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc index a480c840a3..7cfe205e36 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc @@ -35,7 +35,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; positive_int old_sum_degree = output.dims.replica_dims.sum_degree.value; - output.dims.replica_dims.sum_degree.value = positive_int{old_sum_degree / degree}; + output.dims.replica_dims.sum_degree.value = + positive_int{old_sum_degree / degree}; return output; }(); diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc index 044b50fae2..7c559cf5a8 100644 --- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc @@ -7,8 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") { - TensorDims goal = - TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; + TensorDims goal = TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; SUBCASE("dims match") { bool result = 
tensor_dims_is_broadcastable_to(goal, goal); @@ -27,8 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs dim expansion") { - TensorDims curr = - TensorDims{FFOrdered{1_p, 1_p, 1_p, 3_p}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 1_p, 1_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -46,8 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs invalid dim promotion") { - TensorDims curr = - TensorDims{FFOrdered{1_p, 1_p, 2_p, 3_p}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 1_p, 2_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -56,8 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_dims(goal) < num_dims(curr)") { - TensorDims curr = - TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -72,8 +68,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorDims d2 = TensorDims{FFOrdered{10_p, 4_p, 1_p}}; SUBCASE("has target in inputs") { - TensorDims d3 = - TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; + TensorDims d3 = TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -83,8 +78,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has no possible target") { - TensorDims d3 = - TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p}}; + TensorDims d3 = TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -94,8 +88,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has possible target, but not in inputs") { - TensorDims d3 = - TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p, 3_p}}; + TensorDims d3 = TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p, 3_p}}; TensorDims possible_target = TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 863d9909c0..48c6e9a7a6 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ -11,9 +11,9 @@ namespace FlexFlow { positive_int get_num_gpus(MachineSpecification const &ms); positive_int get_num_cpus(MachineSpecification const &ms); positive_int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type); + DeviceType const &device_type); positive_int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type); + DeviceType const &device_type); bool is_valid_machine_space_coordinate(MachineSpecification const &ms, MachineSpaceCoordinate const &coord); diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc index 08afa415af..3db949b99d 100644 --- a/lib/pcg/src/pcg/machine_specification.cc +++ b/lib/pcg/src/pcg/machine_specification.cc @@ -14,7 +14,7 @@ positive_int get_num_cpus(MachineSpecification const &ms) { } positive_int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type) { + DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: return get_num_gpus(ms); @@ -26,7 +26,7 @@ positive_int get_num_devices(MachineSpecification const &ms, } positive_int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type) { + DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: return ms.num_gpus_per_node; diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index 3afa73ca62..0fbb021a55 
100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -105,8 +105,8 @@ std::optional get_machine_space_coordinate( return mv_strides.at(i.unwrap_nonnegative()).unwrapped; }); - std::vector coeffs = scanl( - sizes, 1_p, std::multiplies()); + std::vector coeffs = + scanl(sizes, 1_p, std::multiplies()); nonnegative_int index = start_idx; for (auto [coeff, coord_point, stride] : diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 36ad43f3d3..d612680de6 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -24,7 +24,8 @@ std::unordered_set std::vector> coordinate_ranges = transform(task.degrees, [&](positive_int num_points) { - return nonnegative_range(num_points.nonnegative_int_from_positive_int()); + return nonnegative_range( + num_points.nonnegative_int_from_positive_int()); }); std::unordered_set> raw_coordinates = diff --git a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc index 1568b73162..cf5a1e17f9 100644 --- a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc +++ b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc @@ -33,8 +33,7 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( switch (op_type) { case OperatorType::MULTIHEAD_ATTENTION: return PCGOperatorAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/acc.get( - OperatorAttributeKey::EMBED_DIM), + /*embed_dim=*/acc.get(OperatorAttributeKey::EMBED_DIM), /*num_heads=*/ acc.get(OperatorAttributeKey::NUM_HEADS), /*kdim=*/acc.get(OperatorAttributeKey::KDIM), diff --git a/lib/task-spec/include/task-spec/generic_task_impl_function.h b/lib/task-spec/include/task-spec/generic_task_impl_function.h index b02f4d6beb..31bf132e4f 100644 --- a/lib/task-spec/include/task-spec/generic_task_impl_function.h +++ b/lib/task-spec/include/task-spec/generic_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H -#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/task_argument_accessor.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/init_op_task_impl_function.h b/lib/task-spec/include/task-spec/init_op_task_impl_function.h index f98e972df8..f82d249df1 100644 --- a/lib/task-spec/include/task-spec/init_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/init_op_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H -#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/task_argument_accessor.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/itask_argument_accessor.h b/lib/task-spec/include/task-spec/itask_argument_accessor.h index 1424b09b84..e7d1a81760 100644 --- a/lib/task-spec/include/task-spec/itask_argument_accessor.h +++ b/lib/task-spec/include/task-spec/itask_argument_accessor.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "task-spec/privilege_tensor_accessor.h" #include 
"task-spec/concrete_arg.h" #include "task-spec/op_task_signature.h" +#include "task-spec/privilege_tensor_accessor.h" #include "task-spec/tensor_type.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/attention.h b/lib/task-spec/include/task-spec/ops/attention.h index 9b0179eeac..a8a444c9bf 100644 --- a/lib/task-spec/include/task-spec/ops/attention.h +++ b/lib/task-spec/include/task-spec/ops/attention.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/attention.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/batch_matmul.h b/lib/task-spec/include/task-spec/ops/batch_matmul.h index e0dc01d3f1..a50d1889e1 100644 --- a/lib/task-spec/include/task-spec/ops/batch_matmul.h +++ b/lib/task-spec/include/task-spec/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/batch_norm.h b/lib/task-spec/include/task-spec/ops/batch_norm.h index 081b60318f..bab6a4404a 100644 --- a/lib/task-spec/include/task-spec/ops/batch_norm.h +++ b/lib/task-spec/include/task-spec/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/cast.h b/lib/task-spec/include/task-spec/ops/cast.h index 990624b0e3..dadc8f8c74 100644 --- a/lib/task-spec/include/task-spec/ops/cast.h +++ b/lib/task-spec/include/task-spec/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/combine.h b/lib/task-spec/include/task-spec/ops/combine.h index be16379f36..ea7b3ed365 100644 --- a/lib/task-spec/include/task-spec/ops/combine.h +++ b/lib/task-spec/include/task-spec/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/combine_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/concat.h b/lib/task-spec/include/task-spec/ops/concat.h index 6c7adf76ea..4e7cfef629 100644 --- a/lib/task-spec/include/task-spec/ops/concat.h +++ b/lib/task-spec/include/task-spec/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/concat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git 
a/lib/task-spec/include/task-spec/ops/conv_2d.h b/lib/task-spec/include/task-spec/ops/conv_2d.h index b7fda64961..1efb165d55 100644 --- a/lib/task-spec/include/task-spec/ops/conv_2d.h +++ b/lib/task-spec/include/task-spec/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/conv_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/dropout.h b/lib/task-spec/include/task-spec/ops/dropout.h index 1801b63123..931e3e591e 100644 --- a/lib/task-spec/include/task-spec/ops/dropout.h +++ b/lib/task-spec/include/task-spec/ops/dropout.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/dropout_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/element_binary.h b/lib/task-spec/include/task-spec/ops/element_binary.h index 57af54522d..2bd8c5dde7 100644 --- a/lib/task-spec/include/task-spec/ops/element_binary.h +++ b/lib/task-spec/include/task-spec/ops/element_binary.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H +#include "op-attrs/ops/element_binary_attrs.dtg.h" #include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_signature_impl.h" -#include "op-attrs/ops/element_binary_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/element_unary.h b/lib/task-spec/include/task-spec/ops/element_unary.h index f6dcd41455..5c88871ee7 100644 --- a/lib/task-spec/include/task-spec/ops/element_unary.h +++ b/lib/task-spec/include/task-spec/ops/element_unary.h @@ -1,9 +1,9 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/element_unary_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/embedding.h b/lib/task-spec/include/task-spec/ops/embedding.h index 3a80d38398..27ade01cfa 100644 --- a/lib/task-spec/include/task-spec/ops/embedding.h +++ b/lib/task-spec/include/task-spec/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/embedding_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/flat.h b/lib/task-spec/include/task-spec/ops/flat.h index 6ac72ccd6b..3a02965d3b 100644 --- a/lib/task-spec/include/task-spec/ops/flat.h +++ b/lib/task-spec/include/task-spec/ops/flat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/flat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/gather.h b/lib/task-spec/include/task-spec/ops/gather.h index c5ccc4ccdb..f800173f20 100644 --- a/lib/task-spec/include/task-spec/ops/gather.h +++ b/lib/task-spec/include/task-spec/ops/gather.h @@ -1,9 +1,9 @@ 
#ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/gather_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/layer_norm.h b/lib/task-spec/include/task-spec/ops/layer_norm.h index 81af0c360f..ad418826f2 100644 --- a/lib/task-spec/include/task-spec/ops/layer_norm.h +++ b/lib/task-spec/include/task-spec/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/layer_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/linear.h b/lib/task-spec/include/task-spec/ops/linear.h index 69197fd627..d3c188a2c4 100644 --- a/lib/task-spec/include/task-spec/ops/linear.h +++ b/lib/task-spec/include/task-spec/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/linear_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/pool_2d.h b/lib/task-spec/include/task-spec/ops/pool_2d.h index a3601e8800..fbecd0e96f 100644 --- a/lib/task-spec/include/task-spec/ops/pool_2d.h +++ b/lib/task-spec/include/task-spec/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/pool_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reduce.h b/lib/task-spec/include/task-spec/ops/reduce.h index e44c0f283f..ffcf66e752 100644 --- a/lib/task-spec/include/task-spec/ops/reduce.h +++ b/lib/task-spec/include/task-spec/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduce_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reduction.h b/lib/task-spec/include/task-spec/ops/reduction.h index cba90c37bb..5ddf292672 100644 --- a/lib/task-spec/include/task-spec/ops/reduction.h +++ b/lib/task-spec/include/task-spec/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduction_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/repartition.h b/lib/task-spec/include/task-spec/ops/repartition.h index f43cf13179..dfc42c54e5 100644 --- a/lib/task-spec/include/task-spec/ops/repartition.h +++ b/lib/task-spec/include/task-spec/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/repartition_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/replicate.h 
b/lib/task-spec/include/task-spec/ops/replicate.h index 0086dad741..18f6f74b19 100644 --- a/lib/task-spec/include/task-spec/ops/replicate.h +++ b/lib/task-spec/include/task-spec/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/replicate_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reshape.h b/lib/task-spec/include/task-spec/ops/reshape.h index f192d83b9a..29d29ae84c 100644 --- a/lib/task-spec/include/task-spec/ops/reshape.h +++ b/lib/task-spec/include/task-spec/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reshape_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reverse.h b/lib/task-spec/include/task-spec/ops/reverse.h index bb123b63f5..7c91f91c0b 100644 --- a/lib/task-spec/include/task-spec/ops/reverse.h +++ b/lib/task-spec/include/task-spec/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/softmax.h b/lib/task-spec/include/task-spec/ops/softmax.h index 528dd5da0b..8f99c2658a 100644 --- a/lib/task-spec/include/task-spec/ops/softmax.h +++ b/lib/task-spec/include/task-spec/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/softmax_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/split.h b/lib/task-spec/include/task-spec/ops/split.h index ed92f2925e..1aa8609011 100644 --- a/lib/task-spec/include/task-spec/ops/split.h +++ b/lib/task-spec/include/task-spec/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/split_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/topk.h b/lib/task-spec/include/task-spec/ops/topk.h index 8afe98d568..33f2dbc5d7 100644 --- a/lib/task-spec/include/task-spec/ops/topk.h +++ b/lib/task-spec/include/task-spec/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/topk_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/transpose.h b/lib/task-spec/include/task-spec/ops/transpose.h index dec29f4b36..7762f440cd 100644 --- a/lib/task-spec/include/task-spec/ops/transpose.h +++ b/lib/task-spec/include/task-spec/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include 
"task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/task_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor.h index 2cac3a5dd8..c1c42e09a3 100644 --- a/lib/task-spec/include/task-spec/task_argument_accessor.h +++ b/lib/task-spec/include/task-spec/task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H -#include "task-spec/itask_argument_accessor.h" #include "task-spec/device_specific.h" +#include "task-spec/itask_argument_accessor.h" #include "task-spec/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/task_signature_impl.h b/lib/task-spec/include/task-spec/task_signature_impl.h index ee093c7d23..fcf9b346cf 100644 --- a/lib/task-spec/include/task-spec/task_signature_impl.h +++ b/lib/task-spec/include/task-spec/task_signature_impl.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H -#include "task-spec/task_signature_impl.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" +#include "task-spec/task_signature_impl.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc index 7e6c5062e2..8db2281bcf 100644 --- a/lib/task-spec/src/task-spec/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -129,7 +129,8 @@ static DeviceSpecificDeviceStates num_replicas *= input.shape.at(legion_dim_t{i}); } positive_int effective_num_elements = M; - positive_int effective_batch_size = positive_int{input.shape.num_elements() / M}; + positive_int effective_batch_size = + positive_int{input.shape.num_elements() / M}; LayerNormPerDeviceState per_device_state = init_kernel(handle, diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index 3bf8080877..5e56ccdc1b 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -1,7 +1,7 @@ #include "task-spec/ops/linear.h" #include "kernels/linear_kernels.h" -#include "task-spec/task_argument_accessor.h" #include "op-attrs/ff_dim_t.h" +#include "task-spec/task_argument_accessor.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/task-spec/src/task-spec/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc index aa3184c999..145a9b58a3 100644 --- a/lib/task-spec/src/task-spec/ops/split.cc +++ b/lib/task-spec/src/task-spec/ops/split.cc @@ -48,7 +48,8 @@ static std::pair calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) { positive_int num_blocks = 1_p; positive_int block_size = 1_p; - for (nonnegative_int d : nonnegative_range(array_shape.num_elements().nonnegative_int_from_positive_int())) { + for (nonnegative_int d : nonnegative_range( + array_shape.num_elements().nonnegative_int_from_positive_int())) { if (d <= axis.value) { block_size *= array_shape.at(legion_dim_t{d}); } else { diff --git a/lib/task-spec/src/task-spec/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc index ea2d855bf6..bdf92d8487 100644 --- a/lib/task-spec/src/task-spec/ops/topk.cc +++ b/lib/task-spec/src/task-spec/ops/topk.cc @@ -104,7 +104,8 @@ static std::optional auto indices = 
acc.get_tensor(INDICES); positive_int length = input_grad.shape.at(legion_dim_t{0_n}); - positive_int batch_size = positive_int{input_grad.shape.num_elements() / length}; + positive_int batch_size = + positive_int{input_grad.shape.num_elements() / length}; return profile(backward_kernel, profiling,
diff --git a/lib/task-spec/test/src/task-spec/arg_ref.cc b/lib/task-spec/test/src/task-spec/arg_ref.cc index dcc2e9e580..5c331a1d71 100644 --- a/lib/task-spec/test/src/task-spec/arg_ref.cc +++ b/lib/task-spec/test/src/task-spec/arg_ref.cc @@ -1,22 +1,22 @@ -#include #include "task-spec/arg_ref.h" +#include #include using namespace ::FlexFlow; -enum class ExampleLabelType { +enum class ExampleLabelType { STRING, }; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ArgRefSpec::holds") { - ArgRefSpec arg_ref_spec = ArgRefSpec::create( - ArgRef{ExampleLabelType::STRING} - ); + ArgRefSpec arg_ref_spec = + ArgRefSpec::create( + ArgRef{ExampleLabelType::STRING}); SUBCASE("returns true if the type matches the ArgRef type") { bool result = arg_ref_spec.holds(); - bool correct = true; + bool correct = true; CHECK(result == correct); }
diff --git a/lib/utils/src/utils/positive_int/positive_int.cc b/lib/utils/src/utils/positive_int/positive_int.cc index 70233e74d8..93d4d17148 100644 --- a/lib/utils/src/utils/positive_int/positive_int.cc +++ b/lib/utils/src/utils/positive_int/positive_int.cc @@ -3,21 +3,16 @@ namespace FlexFlow { -positive_int::positive_int(int value) - : value_(value) -{ +positive_int::positive_int(int value) : value_(value) { this->check_invariant(); } -positive_int::positive_int(size_t value) - : value_(value) -{ +positive_int::positive_int(size_t value) : value_(value) { this->check_invariant(); } positive_int::positive_int(nonnegative_int value) - : value_(value.unwrap_nonnegative()) -{ + : value_(value.unwrap_nonnegative()) { this->check_invariant(); } @@ -196,7 +191,6 @@ nonnegative_int positive_int::operator*(nonnegative_int other) const { return other * *this; } - nonnegative_int operator*(nonnegative_int lhs, positive_int rhs) { return lhs * rhs.nonnegative_int_from_positive_int(); } @@ -247,10 +241,10 @@ void positive_int::check_invariant() const { } positive_int operator""_p(unsigned long long int x) { - ASSERT(x <= static_cast<unsigned long long int>(std::numeric_limits<int>::max())); + ASSERT(x <= + static_cast<unsigned long long int>(std::numeric_limits<int>::max())); return positive_int{static_cast<int>(x)}; - } } // namespace FlexFlow @@ -268,8 +262,7 @@ void adl_serializer<::FlexFlow::positive_int>::to_json( } // namespace nlohmann namespace rc { -Gen<::FlexFlow::positive_int> - Arbitrary<::FlexFlow::positive_int>::arbitrary() { +Gen<::FlexFlow::positive_int> Arbitrary<::FlexFlow::positive_int>::arbitrary() { return gen::construct<::FlexFlow::positive_int>(gen::positive<int>()); } } // namespace rc
diff --git a/lib/utils/test/src/utils/containers/sum.cc b/lib/utils/test/src/utils/containers/sum.cc index 2e335b1051..2beaee6526 100644 --- a/lib/utils/test/src/utils/containers/sum.cc +++ b/lib/utils/test/src/utils/containers/sum.cc @@ -1,7 +1,7 @@ #include "utils/containers/sum.h" +#include "utils/positive_int/positive_int.h" #include #include -#include "utils/positive_int/positive_int.h" using namespace ::FlexFlow; @@ -28,16 +28,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("sum(std::vector<positive_int>)") { SUBCASE("returns the sum if the input is not empty") { - std::vector<positive_int> input = {3_p, 9_p, 3_p}; + std::vector<positive_int> input = {3_p, 9_p, 3_p}; positive_int result = sum(input); positive_int correct = 15_p; CHECK(result == correct); - } + } - SUBCASE("throws an error if the
input is empty, as then 0 should be returned") { - std::vector input = {}; + SUBCASE( + "throws an error if the input is empty, as then 0 should be returned") { + std::vector input = {}; CHECK_THROWS(sum(input)); } diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc index 25348d34da..d35ea83aad 100644 --- a/lib/utils/test/src/utils/positive_int/positive_int.cc +++ b/lib/utils/test/src/utils/positive_int/positive_int.cc @@ -1,5 +1,5 @@ -#include #include "utils/positive_int/positive_int.h" +#include using namespace ::FlexFlow; From cebd06cdb397b604c4bc9a0dc5a4ec0e92c16996 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 09:10:15 -0700 Subject: [PATCH 75/91] Merge branch 'master' into local-e2e-training --- .github/runs-on.yml | 12 + .github/workflows/tests.yml | 2 +- .../unlabelled/find_pattern_matches.cc | 34 +- .../unlabelled/pattern_matching.cc | 33 +- .../test/src/substitutions/pcg_pattern.cc | 394 ++++++++++++------ .../algorithms/get_subgraph.h | 12 + .../algorithms/get_subgraph.cc | 165 ++++---- .../graph/open_dataflow_graph/get_subgraph.cc | 349 ++++++++++++++++ 8 files changed, 769 insertions(+), 232 deletions(-) create mode 100644 lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 75038549ab..a4fff33536 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -12,3 +12,15 @@ images: arch: "x64" owner: "135269210855" # runs-on name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" + + official-ubuntu-ami: + platform: "linux" + arch: "x64" + ami: "ami-0a60b027285c0d4c5" + + flexflow-gpu-ci: + platform: "linux" + arch: "x64" + owner: "409719625166" # flexflow + name: "flexflow-gpu-ci" + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 861fcc1ea7..9d98fb07dd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,7 +7,7 @@ concurrency: jobs: cpu-ci: name: CPU unit tests and build - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 steps: - name: Checkout Git Repository diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index a7ebc0bff7..9d8e4bc259 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -11,6 +11,7 @@ #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" +#include "utils/overload.h" namespace FlexFlow { @@ -67,6 +68,27 @@ static std::optional return match; } +MatchAdditionalCriterion additional_criterion_for_subpattern( + MatchAdditionalCriterion const &full_additional_criterion, + bidict const + &full_pattern_values_to_subpattern_inputs) { + return MatchAdditionalCriterion{ + full_additional_criterion.node_criterion, + [&](PatternValue const &patternValue, OpenDataflowValue const &pcgValue) { + return patternValue.visit( + overload{[&](PatternNodeOutput const &) -> bool { + return full_additional_criterion.value_criterion( + patternValue, pcgValue); + }, + [&](PatternInput const &i) -> bool { + PatternValue full_pattern_value = + full_pattern_values_to_subpattern_inputs.at_r(i); + return full_additional_criterion.value_criterion( + full_pattern_value, pcgValue); + }}); + }}; +} + std::vector find_pattern_matches(UnlabelledGraphPattern 
const &pattern, OpenDataflowGraphView const &graph, @@ -87,10 +109,18 @@ std::vector PatternSplitResult subpatterns = apply_split(pattern, split); std::vector prefix_matches = find_pattern_matches( - subpatterns.subpattern_1, graph, additional_criterion); + subpatterns.subpattern_1, + graph, + additional_criterion_for_subpattern( + additional_criterion, + subpatterns.full_pattern_values_to_subpattern_1_inputs)); std::vector postfix_matches = find_pattern_matches( - subpatterns.subpattern_2, graph, additional_criterion); + subpatterns.subpattern_2, + graph, + additional_criterion_for_subpattern( + additional_criterion, + subpatterns.full_pattern_values_to_subpattern_2_inputs)); for (UnlabelledDataflowGraphPatternMatch const &prefix_match : prefix_matches) { diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index 304bb8cf46..c7b03e24f2 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -7,10 +7,13 @@ #include "substitutions/unlabelled/unlabelled_graph_pattern.h" #include "utils/bidict/algorithms/left_entries.h" #include "utils/bidict/algorithms/right_entries.h" +#include "utils/containers/is_subseteq_of.h" #include "utils/containers/keys.h" #include "utils/containers/transform.h" +#include "utils/containers/values.h" #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/as_dot.h" #include "utils/graph/open_dataflow_graph/algorithms/get_edges.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" @@ -18,6 +21,7 @@ #include "utils/graph/open_dataflow_graph/open_dataflow_edge.dtg.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge.h" #include "utils/overload.h" +#include #include namespace FlexFlow { @@ -46,8 +50,13 @@ struct SubgraphConcreteFromPattern { } OpenDataflowValue operator()(PatternInput const &i) const { - return OpenDataflowValue{full_graph_values_to_subgraph_inputs.at_l( - match.input_assignment.at(i))}; + OpenDataflowValue mapped_input = match.input_assignment.at(i); + if (full_graph_values_to_subgraph_inputs.contains_l(mapped_input)) { + return OpenDataflowValue{ + full_graph_values_to_subgraph_inputs.at_l(mapped_input)}; + } else { + return mapped_input; + } } OpenDataflowEdge operator()(InputPatternEdge const &e) const { @@ -148,11 +157,27 @@ bool unlabelled_pattern_does_match( UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { + std::unordered_set matched_by_pattern_inputs = + unordered_set_of(values(match.input_assignment)); + + ASSERT(left_entries(match.node_assignment) == get_nodes(pattern)); + ASSERT( + is_subseteq_of(right_entries(match.node_assignment), get_nodes(graph))); + ASSERT(keys(match.input_assignment) == get_graph_inputs(pattern)); + ASSERT(is_subseteq_of(matched_by_pattern_inputs, + get_open_dataflow_values(graph))); + OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; - assert(left_entries(match.node_assignment) == get_nodes(pattern)); - assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); + std::unordered_set full_values_split_by_subgraph = + 
left_entries(subgraph_result.full_graph_values_to_subgraph_inputs); + + ASSERT(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); + ASSERT(is_subseteq_of(full_values_split_by_subgraph, + get_open_dataflow_values(graph)), + full_values_split_by_subgraph, + get_open_dataflow_values(graph)); MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ diff --git a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index 8ba1fee873..4dbf0885dd 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -13,144 +13,260 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_pattern_matches(PCGPattern, SubParallelComputationGraph)") { - ParallelComputationGraphBuilder builder; - - nonnegative_int batch_size = 16_n; - nonnegative_int batch_degree = 2_n; - nonnegative_int num_channels = 24_n; - - TensorShape a_shape = TensorShape{ - TensorDims{ - FFOrdered{ - batch_size, - num_channels, - }, - }, - DataType::FLOAT, - }; - - std::string a_name = "a"; - - parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape); - a_tensor = - builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); - - nonnegative_int outDim = 16_n; - std::string x_matmul_name = "x_matmul"; - std::string y_matmul_name = "y_matmul"; - parallel_tensor_guid_t t0 = - builder.dense(a_tensor, - outDim, - /*activation=*/std::nullopt, - /*use_bias=*/false, - DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - x_matmul_name); - parallel_tensor_guid_t t1 = - builder.dense(a_tensor, - outDim, - /*activation=*/std::nullopt, - /*use_bias=*/false, - DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - y_matmul_name); - parallel_tensor_guid_t t2 = builder.add(t0, t1); - - ParallelComputationGraph pcg = builder.pcg; - parallel_layer_guid_t x_matmul = - get_parallel_layer_by_name(pcg, x_matmul_name); - parallel_layer_guid_t y_matmul = - get_parallel_layer_by_name(pcg, y_matmul_name); - std::vector x_incoming = - get_incoming_tensors(pcg, x_matmul); - REQUIRE(x_incoming.size() == 2); - parallel_tensor_guid_t x_weights = x_incoming.at(1); - std::vector y_incoming = - get_incoming_tensors(pcg, y_matmul); - REQUIRE(y_incoming.size() == 2); - parallel_tensor_guid_t y_weights = y_incoming.at(1); - - LabelledOpenDataflowGraph - g = LabelledOpenDataflowGraph:: - create>(); - - TensorAttributePattern pattern_tensor_a = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_b = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_c = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_x = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_y = - tensor_attribute_pattern_match_all(); - - OperatorAttributePattern op_pattern_1 = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::LINEAR), - }}; - - OperatorAttributePattern op_pattern_2 = op_pattern_1; - - DataflowGraphInput pt_a = g.add_input(pattern_tensor_a); - DataflowGraphInput pt_b = g.add_input(pattern_tensor_b); - DataflowGraphInput pt_c = g.add_input(pattern_tensor_c); - - NodeAddedResult op_pattern_1_added = - g.add_node(op_pattern_1, - {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_b}}, - {pattern_tensor_x}); - PatternNode op_pattern_1_node = PatternNode{op_pattern_1_added.node}; - 
OpenDataflowValue pt_x = - OpenDataflowValue{get_only(op_pattern_1_added.outputs)}; - - NodeAddedResult op_pattern_2_added = - g.add_node(op_pattern_2, - {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_c}}, - {pattern_tensor_y}); - PatternNode op_pattern_2_node = PatternNode{op_pattern_2_added.node}; - OpenDataflowValue pt_y = - OpenDataflowValue{get_only(op_pattern_2_added.outputs)}; - - PCGPattern pattern = PCGPattern{g}; - - std::unordered_set result = unordered_set_of( - find_pattern_matches(pattern, sub_pcg_from_full_pcg(pcg))); - - PCGPatternMatch match1 = - PCGPatternMatch{bidict{ - {op_pattern_1_node, x_matmul}, - {op_pattern_2_node, y_matmul}, - }, - bidict{ - {PatternInput{pt_a}, - open_parallel_tensor_guid_from_closed(a_tensor)}, - {PatternInput{pt_b}, - open_parallel_tensor_guid_from_closed(x_weights)}, - {PatternInput{pt_c}, - open_parallel_tensor_guid_from_closed(y_weights)}, - }}; - - PCGPatternMatch match2 = - PCGPatternMatch{bidict{ - {op_pattern_1_node, y_matmul}, - {op_pattern_2_node, x_matmul}, - }, - bidict{ - {PatternInput{pt_a}, - open_parallel_tensor_guid_from_closed(a_tensor)}, - {PatternInput{pt_b}, - open_parallel_tensor_guid_from_closed(y_weights)}, - {PatternInput{pt_c}, - open_parallel_tensor_guid_from_closed(x_weights)}, - }}; - - std::unordered_set correct = {match1, match2}; - - CHECK(result == correct); + SUBCASE("simple case") { + ParallelComputationGraphBuilder builder; + + nonnegative_int batch_size = 16_n; + nonnegative_int batch_degree = 2_n; + nonnegative_int num_channels = 24_n; + + TensorShape a_shape = TensorShape{ + TensorDims{ + FFOrdered{ + batch_size, + num_channels, + }, + }, + DataType::FLOAT, + }; + + std::string a_name = "a"; + + parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape); + a_tensor = + builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); + + nonnegative_int outDim = 16_n; + std::string x_matmul_name = "x_matmul"; + std::string y_matmul_name = "y_matmul"; + parallel_tensor_guid_t t0 = + builder.dense(a_tensor, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + x_matmul_name); + parallel_tensor_guid_t t1 = + builder.dense(a_tensor, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + y_matmul_name); + parallel_tensor_guid_t t2 = builder.add(t0, t1); + + ParallelComputationGraph pcg = builder.pcg; + parallel_layer_guid_t x_matmul = + get_parallel_layer_by_name(pcg, x_matmul_name); + parallel_layer_guid_t y_matmul = + get_parallel_layer_by_name(pcg, y_matmul_name); + std::vector x_incoming = + get_incoming_tensors(pcg, x_matmul); + REQUIRE(x_incoming.size() == 2); + parallel_tensor_guid_t x_weights = x_incoming.at(1); + std::vector y_incoming = + get_incoming_tensors(pcg, y_matmul); + REQUIRE(y_incoming.size() == 2); + parallel_tensor_guid_t y_weights = y_incoming.at(1); + + LabelledOpenDataflowGraph + g = LabelledOpenDataflowGraph:: + create>(); + + TensorAttributePattern pattern_tensor_a = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_b = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_c = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_x = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_y = + tensor_attribute_pattern_match_all(); + + OperatorAttributePattern 
op_pattern_1 = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + }}; + + OperatorAttributePattern op_pattern_2 = op_pattern_1; + + DataflowGraphInput pt_a = g.add_input(pattern_tensor_a); + DataflowGraphInput pt_b = g.add_input(pattern_tensor_b); + DataflowGraphInput pt_c = g.add_input(pattern_tensor_c); + + NodeAddedResult op_pattern_1_added = + g.add_node(op_pattern_1, + {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_b}}, + {pattern_tensor_x}); + PatternNode op_pattern_1_node = PatternNode{op_pattern_1_added.node}; + OpenDataflowValue pt_x = + OpenDataflowValue{get_only(op_pattern_1_added.outputs)}; + + NodeAddedResult op_pattern_2_added = + g.add_node(op_pattern_2, + {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_c}}, + {pattern_tensor_y}); + PatternNode op_pattern_2_node = PatternNode{op_pattern_2_added.node}; + OpenDataflowValue pt_y = + OpenDataflowValue{get_only(op_pattern_2_added.outputs)}; + + PCGPattern pattern = PCGPattern{g}; + + std::unordered_set result = unordered_set_of( + find_pattern_matches(pattern, sub_pcg_from_full_pcg(pcg))); + + PCGPatternMatch match1 = PCGPatternMatch{ + bidict{ + {op_pattern_1_node, x_matmul}, + {op_pattern_2_node, y_matmul}, + }, + bidict{ + {PatternInput{pt_a}, + open_parallel_tensor_guid_from_closed(a_tensor)}, + {PatternInput{pt_b}, + open_parallel_tensor_guid_from_closed(x_weights)}, + {PatternInput{pt_c}, + open_parallel_tensor_guid_from_closed(y_weights)}, + }}; + + PCGPatternMatch match2 = PCGPatternMatch{ + bidict{ + {op_pattern_1_node, y_matmul}, + {op_pattern_2_node, x_matmul}, + }, + bidict{ + {PatternInput{pt_a}, + open_parallel_tensor_guid_from_closed(a_tensor)}, + {PatternInput{pt_b}, + open_parallel_tensor_guid_from_closed(y_weights)}, + {PatternInput{pt_c}, + open_parallel_tensor_guid_from_closed(x_weights)}, + }}; + + std::unordered_set correct = {match1, match2}; + + CHECK(result == correct); + } + + SUBCASE("pcg is a chain") { + ParallelComputationGraphBuilder builder; + + nonnegative_int batch_size = 16_n; + nonnegative_int batch_degree = 2_n; + nonnegative_int num_channels = 24_n; + + TensorShape a_shape = TensorShape{ + TensorDims{ + FFOrdered{ + batch_size, + num_channels, + }, + }, + DataType::FLOAT, + }; + + std::string a_name = "a"; + + parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape); + a_tensor = + builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); + + nonnegative_int outDim = 16_n; + std::string x_matmul_name = "x_matmul"; + std::string y_matmul_name = "y_matmul"; + parallel_tensor_guid_t t0 = + builder.dense(a_tensor, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + x_matmul_name); + parallel_tensor_guid_t t1 = + builder.dense(t0, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + y_matmul_name); + parallel_tensor_guid_t t2 = + builder.dense(t1, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + parallel_tensor_guid_t t3 = + builder.dense(t2, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + ParallelComputationGraph pcg = builder.pcg; + + LabelledOpenDataflowGraph + g = LabelledOpenDataflowGraph:: + create>(); + + 
TensorAttributePattern pattern_tensor_a = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_b = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_c = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_x = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_y = + tensor_attribute_pattern_match_all(); + + OperatorAttributePattern op_pattern_1 = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + }}; + + OperatorAttributePattern op_pattern_2 = op_pattern_1; + + DataflowGraphInput pt_a = g.add_input(pattern_tensor_a); + DataflowGraphInput pt_b = g.add_input(pattern_tensor_b); + DataflowGraphInput pt_c = g.add_input(pattern_tensor_c); + + NodeAddedResult op_pattern_1_added = + g.add_node(op_pattern_1, + {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_b}}, + {pattern_tensor_x}); + PatternNode op_pattern_1_node = PatternNode{op_pattern_1_added.node}; + OpenDataflowValue pt_x = + OpenDataflowValue{get_only(op_pattern_1_added.outputs)}; + + NodeAddedResult op_pattern_2_added = + g.add_node(op_pattern_2, + {OpenDataflowValue{pt_x}, OpenDataflowValue{pt_c}}, + {pattern_tensor_y}); + PatternNode op_pattern_2_node = PatternNode{op_pattern_2_added.node}; + + PCGPattern pattern = PCGPattern{g}; + + std::unordered_set result = unordered_set_of( + find_pattern_matches(pattern, sub_pcg_from_full_pcg(pcg))); + + CHECK(result.size() == 3); + } } } diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h index 202058a3d1..f5bbbc228d 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_GET_SUBGRAPH_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_GET_SUBGRAPH_H +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_data.dtg.h" #include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_subgraph_result.dtg.h" #include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" #include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" @@ -10,6 +11,17 @@ namespace FlexFlow { OpenDataflowSubgraphResult get_subgraph(OpenDataflowGraphView const &, std::unordered_set const &); +bidict + get_full_graph_values_to_subgraph_inputs( + OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes); + +OpenDataflowGraphData + get_subgraph_data(OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes, + bidict const + &full_graph_values_to_subgraph_inputs); + } // namespace FlexFlow #endif diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc index ad3d4f26c0..36f027f792 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc @@ -4,7 +4,11 @@ #include "utils/containers/is_subseteq_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" +#include "utils/graph/dataflow_graph/dataflow_output_query.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/from_open_dataflow_graph_data.h" 
+#include "utils/graph/open_dataflow_graph/algorithms/get_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.h" #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" #include "utils/graph/open_dataflow_graph/dataflow_graph_input_source.h" #include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" @@ -13,100 +17,89 @@ namespace FlexFlow { -struct OpenDataflowSubgraph final : public IOpenDataflowGraphView { - OpenDataflowSubgraph(OpenDataflowGraphView const &full_graph, - std::unordered_set const &subgraph_nodes, - bidict const - &full_graph_values_to_subgraph_inputs) - : full_graph(full_graph), subgraph_nodes(subgraph_nodes), - full_graph_values_to_subgraph_inputs( - full_graph_values_to_subgraph_inputs) { - assert(is_subseteq_of(this->subgraph_nodes, get_nodes(full_graph))); - } - - std::unordered_set query_nodes(NodeQuery const &q) const override { - return intersection(this->full_graph.query_nodes(q), this->subgraph_nodes); - } - - std::unordered_set - query_edges(OpenDataflowEdgeQuery const &q) const override { - std::unordered_set result; - for (OpenDataflowEdge const &open_e : this->full_graph.query_edges(q)) { - open_e.visit(overload{ - [&](DataflowEdge const &e) { - bool contains_src = contains(this->subgraph_nodes, e.src.node); - bool contains_dst = contains(this->subgraph_nodes, e.dst.node); - if (contains_src && contains_dst) { - result.insert(OpenDataflowEdge{e}); - } else if (contains_dst && !contains_src) { - result.insert(OpenDataflowEdge{DataflowInputEdge{ - this->full_graph_values_to_subgraph_inputs.at_l( - OpenDataflowValue{e.src}), - e.dst}}); - } - return std::nullopt; - }, - [&](DataflowInputEdge const &e) { - if (contains(this->subgraph_nodes, e.dst.node)) { - result.insert(OpenDataflowEdge{DataflowInputEdge{ - this->full_graph_values_to_subgraph_inputs.at_l( - OpenDataflowValue{e.src}), - e.dst}}); - } - return std::nullopt; - }}); - } - return result; - } - - std::unordered_set - query_outputs(DataflowOutputQuery const &q) const override { - return filter(this->full_graph.query_outputs(q), - [&](DataflowOutput const &o) { - return contains(this->subgraph_nodes, o.node); - }); - } - - std::unordered_set get_inputs() const override { - return unordered_set_of(values(this->full_graph_values_to_subgraph_inputs)); - }; - - OpenDataflowSubgraph *clone() const override { - return new OpenDataflowSubgraph{ - this->full_graph, - this->subgraph_nodes, - this->full_graph_values_to_subgraph_inputs, - }; - } - -private: - OpenDataflowGraphView full_graph; - std::unordered_set subgraph_nodes; - bidict - full_graph_values_to_subgraph_inputs; -}; - OpenDataflowSubgraphResult get_subgraph(OpenDataflowGraphView const &g, std::unordered_set const &subgraph_nodes) { - DataflowGraphInputSource input_source; bidict - full_graph_values_to_subgraph_inputs = generate_bidict( - get_subgraph_inputs(g, subgraph_nodes), - [&](OpenDataflowValue const &v) -> DataflowGraphInput { - return v.visit(overload{ - [](DataflowGraphInput const &i) { return i; }, - [&](DataflowOutput const &) { - return input_source.new_dataflow_graph_input(); - }, - }); - }); + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(g, subgraph_nodes); return OpenDataflowSubgraphResult{ - OpenDataflowGraphView::create( - g, subgraph_nodes, full_graph_values_to_subgraph_inputs), + OpenDataflowGraphView::create( + get_subgraph_data( + g, subgraph_nodes, full_graph_values_to_subgraph_inputs)), 
full_graph_values_to_subgraph_inputs, }; } +bidict + get_full_graph_values_to_subgraph_inputs( + OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes) { + DataflowGraphInputSource input_source; + return generate_bidict(get_subgraph_inputs(g, subgraph_nodes), + [&](OpenDataflowValue const &v) -> DataflowGraphInput { + return v.visit(overload{ + [](DataflowGraphInput const &i) { return i; }, + [&](DataflowOutput const &) { + return input_source.new_dataflow_graph_input(); + }, + }); + }); +} + +OpenDataflowGraphData + get_subgraph_data(OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes, + bidict const + &full_graph_values_to_subgraph_inputs) { + std::unordered_set subgraph_input_edges = + transform(get_subgraph_incoming_edges(g, subgraph_nodes), + [&](OpenDataflowEdge const &edge) { + return edge.visit( + overload{[&](DataflowInputEdge const &e) { + return OpenDataflowEdge{DataflowInputEdge{ + full_graph_values_to_subgraph_inputs.at_l( + OpenDataflowValue{e.src}), + e.dst}}; + }, + [&](DataflowEdge const &e) { + return OpenDataflowEdge{DataflowInputEdge{ + full_graph_values_to_subgraph_inputs.at_l( + OpenDataflowValue{e.src}), + e.dst}}; + }}); + }); + + OpenDataflowEdgeQuery subgraph_interior_edges_query = OpenDataflowEdgeQuery{ + DataflowInputEdgeQuery{ + query_set::match_none(), + query_set::match_none(), + query_set::match_none(), + }, + DataflowEdgeQuery{ + query_set{subgraph_nodes}, + query_set::matchall(), + query_set{subgraph_nodes}, + query_set::matchall(), + }, + }; + std::unordered_set subgraph_interior_edges = + g.query_edges(subgraph_interior_edges_query); + + std::unordered_set subgraph_inputs = + unordered_set_of(values(full_graph_values_to_subgraph_inputs)); + std::unordered_set subgraph_outputs = + filter(g.query_outputs(dataflow_output_query_all()), + [&](DataflowOutput const &o) { + return contains(subgraph_nodes, o.node); + }); + return OpenDataflowGraphData{ + subgraph_nodes, + set_union(subgraph_input_edges, subgraph_interior_edges), + subgraph_inputs, + subgraph_outputs, + }; +} + } // namespace FlexFlow diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc new file mode 100644 index 0000000000..c44e5f81b7 --- /dev/null +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc @@ -0,0 +1,349 @@ +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" +#include "utils/bidict/algorithms/left_entries.h" +#include "utils/containers/contains.h" +#include "utils/containers/get_only.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_full_graph_values_to_subgraph_inputs(OpenDataflowGraphView, " + "std::unordered_set) ") { + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + DataflowGraphInput i0 = graph.add_input(); + DataflowGraphInput i1 = graph.add_input(); + DataflowGraphInput i2 = graph.add_input(); + + NodeAddedResult n0_added = graph.add_node({OpenDataflowValue{i0}}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = graph.add_node({v0, OpenDataflowValue{i1}}, 1_n); + Node n1 = n1_added.node; + OpenDataflowValue v1 = 
OpenDataflowValue{get_only(n1_added.outputs)}; + + NodeAddedResult n2_added = graph.add_node({v0}, 1_n); + Node n2 = n2_added.node; + OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; + + NodeAddedResult n3_added = + graph.add_node({OpenDataflowValue{i2}, v1, v2}, 1_n); + Node n3 = n3_added.node; + + std::unordered_set subgraph_nodes = {n1, n2, n3}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + SUBCASE("left entries are correct") { + std::unordered_set correct = { + v0, OpenDataflowValue{i1}, OpenDataflowValue{i2}}; + CHECK(left_entries(full_graph_values_to_subgraph_inputs) == correct); + } + + SUBCASE("mapping is correct") { + CHECK(full_graph_values_to_subgraph_inputs.at_l(OpenDataflowValue{i1}) == + i1); + CHECK(full_graph_values_to_subgraph_inputs.at_l(OpenDataflowValue{i2}) == + i2); + std::unordered_set inputs = {i1, i2}; + CHECK(!contains(inputs, full_graph_values_to_subgraph_inputs.at_l(v0))); + } + } + + TEST_CASE( + "get_subgraph_data(OpenDataflowGraphView, std::unordered_set, " + "bidict)") { + SUBCASE("2-node graph without inputs") { + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + NodeAddedResult n0_added = graph.add_node({}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); + Node n1 = n1_added.node; + + SUBCASE("subgraph is full graph") { + std::unordered_set subgraph_nodes = {n0, n1}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + {OpenDataflowEdge{ + DataflowEdge{DataflowOutput{n0, 0_n}, DataflowInput{n1, 0_n}}}}, + {}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n1, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is n0") { + std::unordered_set subgraph_nodes = {n0}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + OpenDataflowGraphData correct = OpenDataflowGraphData{subgraph_nodes, + {}, + {}, + {DataflowOutput{ + n0, + 0_n, + }}}; + CHECK(result == correct); + } + + SUBCASE("subgraph is n1") { + std::unordered_set subgraph_nodes = {n1}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + DataflowGraphInput n0_as_subgraph_input = + full_graph_values_to_subgraph_inputs.at_l(v0); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + {OpenDataflowEdge{DataflowInputEdge{n0_as_subgraph_input, + DataflowInput{n1, 0_n}}}}, + {n0_as_subgraph_input}, + {DataflowOutput{ + n1, + 0_n, + }}}; + CHECK(result == correct); + } + + SUBCASE("subgraph is empty") { + std::unordered_set subgraph_nodes = {}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + OpenDataflowGraphData correct = + 
OpenDataflowGraphData{subgraph_nodes, {}, {}, {}}; + CHECK(result == correct); + } + } + + SUBCASE("3-node graph with inputs") { + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + DataflowGraphInput i0 = graph.add_input(); + DataflowGraphInput i1 = graph.add_input(); + + NodeAddedResult n0_added = graph.add_node({OpenDataflowValue{i0}}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = + graph.add_node({v0, OpenDataflowValue{i1}}, 1_n); + Node n1 = n1_added.node; + + NodeAddedResult n2_added = graph.add_node({v0}, 1_n); + Node n2 = n2_added.node; + + SUBCASE("subgraph is full graph") { + std::unordered_set subgraph_nodes = {n0, n1, n2}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i0, DataflowInput{n0, 0_n}}}, + OpenDataflowEdge{DataflowInputEdge{i1, DataflowInput{n1, 1_n}}}, + OpenDataflowEdge{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}}}, + OpenDataflowEdge{{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n2, 0_n}}}}, + }, + {i0, i1}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n1, + 0_n, + }, + DataflowOutput{ + n2, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is (n0, n1) split") { + std::unordered_set subgraph_nodes = {n0, n1}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i0, DataflowInput{n0, 0_n}}}, + OpenDataflowEdge{DataflowInputEdge{i1, DataflowInput{n1, 1_n}}}, + OpenDataflowEdge{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}}}, + }, + {i0, i1}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n1, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is (n0, n2) split") { + std::unordered_set subgraph_nodes = {n0, n2}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i0, DataflowInput{n0, 0_n}}}, +
OpenDataflowEdge{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n2, 0_n}}}, + }, + {i0}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n2, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is (n1, n2) split") { + std::unordered_set subgraph_nodes = {n1, n2}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + DataflowGraphInput n0_as_subgraph_input = + full_graph_values_to_subgraph_inputs.at_l(OpenDataflowValue{v0}); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i1, DataflowInput{n1, 1_n}}}, + OpenDataflowEdge{DataflowInputEdge{n0_as_subgraph_input, + DataflowInput{n1, 0_n}}}, + OpenDataflowEdge{DataflowInputEdge{n0_as_subgraph_input, + DataflowInput{n2, 0_n}}}, + }, + {i1, n0_as_subgraph_input}, + { + DataflowOutput{ + n1, + 0_n, + }, + DataflowOutput{ + n2, + 0_n, + }, + }}; + CHECK(result == correct); + } + } + } +} From ea1a6dfb3d7d31aed39dc947348999d4447c5185 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 23 May 2025 05:04:48 +0000 Subject: [PATCH 76/91] Add tests for positive_int --- .../include/utils/positive_int/positive_int.h | 2 +- .../src/utils/positive_int/positive_int.cc | 4 +- .../src/utils/positive_int/positive_int.cc | 479 +++++++++++++++++- 3 files changed, 480 insertions(+), 5 deletions(-) diff --git a/lib/utils/include/utils/positive_int/positive_int.h b/lib/utils/include/utils/positive_int/positive_int.h index 9ff0f4da64..6ddddadf50 100644 --- a/lib/utils/include/utils/positive_int/positive_int.h +++ b/lib/utils/include/utils/positive_int/positive_int.h @@ -69,7 +69,7 @@ struct positive_int { friend float &operator/=(float &lhs, positive_int rhs); nonnegative_int operator%(positive_int other) const; - nonnegative_int operator%(nonnegative_int other) const; + friend nonnegative_int operator%(nonnegative_int lhs, positive_int rhs); int int_from_positive_int() const; nonnegative_int nonnegative_int_from_positive_int() const; diff --git a/lib/utils/src/utils/positive_int/positive_int.cc b/lib/utils/src/utils/positive_int/positive_int.cc index 93d4d17148..3c4b0b4440 100644 --- a/lib/utils/src/utils/positive_int/positive_int.cc +++ b/lib/utils/src/utils/positive_int/positive_int.cc @@ -215,8 +215,8 @@ nonnegative_int positive_int::operator%(positive_int other) const { return nonnegative_int{this->value_ % other.value_}; } -nonnegative_int positive_int::operator%(nonnegative_int other) const { - return nonnegative_int{this->value_ % other.unwrap_nonnegative()}; +nonnegative_int operator%(nonnegative_int lhs, positive_int rhs) { + return nonnegative_int{lhs.unwrap_nonnegative() % rhs.value_}; } int positive_int::int_from_positive_int() const { diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc index d35ea83aad..77ecbf854d 100644 --- a/lib/utils/test/src/utils/positive_int/positive_int.cc +++ b/lib/utils/test/src/utils/positive_int/positive_int.cc @@ -1,10 +1,485 @@ #include "utils/positive_int/positive_int.h" #include +#include "test/utils/rapidcheck.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("positive_int") { - CHECK_MESSAGE(false, "TODO: positive_int"); + TEST_CASE("positive_int{int}") { + int x1 = 3; + int x2 = 4; + + int zero = 0; + int negative = -3; + + 
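// construction is expected to validate the argument: the positive values + // above wrap successfully, while zero and negative must throw (see the + // CHECK_THROWS cases below). +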
CHECK(positive_int{x1} == positive_int{x1}); + CHECK(positive_int{x2} != positive_int{x1}); + + CHECK_THROWS(positive_int{zero}); + CHECK_THROWS(positive_int{negative}); + } + + TEST_CASE("positive_int{size_t}") { + size_t x1 = 3; + size_t x2 = 4; + + size_t zero = 0; + + size_t maxint = static_cast(std::numeric_limits::max()); + size_t overflow1 = static_cast(std::numeric_limits::max()) + 1; + size_t overflow2 = static_cast(std::numeric_limits::max()) + 2; + + CHECK(positive_int{x1} == positive_int{x1}); + CHECK(positive_int{x2} != positive_int{x1}); + + CHECK_THROWS(positive_int{zero}); + CHECK(positive_int{maxint} == positive_int{maxint}); + CHECK_THROWS(positive_int{overflow1}); + CHECK_THROWS(positive_int{overflow2}); + } + + TEST_CASE("positive_int{nonnegative_int}") { + nonnegative_int x1 = 3_n; + nonnegative_int x2 = 4_n; + + nonnegative_int zero = 0_n; + + CHECK(positive_int{x1} == positive_int{x1}); + CHECK(positive_int{x2} != positive_int{x1}); + + CHECK_THROWS(positive_int{zero}); + } + + TEST_CASE("_p notation for positive_int") { + CHECK(9_p == positive_int{9}); + CHECK_THROWS(0_p); + } + + TEST_CASE("static_cast(positive_int)") { + CHECK(static_cast(8_p) == 8); + } + + TEST_CASE("static_cast(positive_int)") { + CHECK(static_cast(6_p) == 6); + } + + TEST_CASE("positive_int < positive_int") { + CHECK(4_p < 5_p); + CHECK_FALSE(7_p < 7_p); + CHECK_FALSE(3_p < 2_p); + } + + TEST_CASE("positive_int == positive_int") { + CHECK_FALSE(4_p == 5_p); + CHECK(7_p == 7_p); + CHECK_FALSE(3_p == 2_p); + } + + TEST_CASE("positive_int > positive_int") { + CHECK_FALSE(4_p > 5_p); + CHECK_FALSE(7_p > 7_p); + CHECK(3_p > 2_p); + } + + TEST_CASE("positive_int <= positive_int") { + CHECK(4_p <= 5_p); + CHECK(7_p <= 7_p); + CHECK_FALSE(3_p <= 2_p); + } + + TEST_CASE("positive_int != positive_int") { + CHECK(4_p != 5_p); + CHECK_FALSE(7_p != 7_p); + CHECK(3_p != 2_p); + } + + TEST_CASE("positive_int >= positive_int") { + CHECK_FALSE(4_p >= 5_p); + CHECK(7_p >= 7_p); + CHECK(3_p >= 2_p); + } + + TEST_CASE("positive_int < nonnegative_int") { + CHECK(4_p < 5_n); + CHECK_FALSE(7_p < 7_n); + CHECK_FALSE(3_p < 2_n); + CHECK_FALSE(1_p < 0_n); + } + + TEST_CASE("positive_int == nonnegative_int") { + CHECK_FALSE(4_p == 5_n); + CHECK(7_p == 7_n); + CHECK_FALSE(3_p == 2_n); + CHECK_FALSE(1_p == 0_n); + } + + TEST_CASE("positive_int > nonnegative_int") { + CHECK_FALSE(4_p > 5_n); + CHECK_FALSE(7_p > 7_n); + CHECK(3_p > 2_n); + CHECK(1_p > 0_n); + } + + TEST_CASE("positive_int <= nonnegative_int") { + CHECK(4_p <= 5_n); + CHECK(7_p <= 7_n); + CHECK_FALSE(3_p <= 2_n); + CHECK_FALSE(1_p <= 0_n); + } + + TEST_CASE("positive_int != nonnegative_int") { + CHECK(4_p != 5_n); + CHECK_FALSE(7_p != 7_n); + CHECK(3_p != 2_n); + CHECK(1_p != 0_n); + } + + TEST_CASE("positive_int >= nonnegative_int") { + CHECK_FALSE(4_p >= 5_n); + CHECK(7_p >= 7_n); + CHECK(3_p >= 2_n); + CHECK(1_p >= 0_n); + } + + TEST_CASE("nonnegative_int < positive_int") { + CHECK(4_n < 5_p); + CHECK_FALSE(7_n < 7_p); + CHECK_FALSE(3_n < 2_p); + CHECK(0_n < 1_p); + } + + TEST_CASE("nonnegative_int == positive_int") { + CHECK_FALSE(4_n == 5_p); + CHECK(7_n == 7_p); + CHECK_FALSE(3_n == 2_p); + CHECK_FALSE(0_n == 1_p); + } + + TEST_CASE("nonnegative_int > positive_int") { + CHECK_FALSE(4_n > 5_p); + CHECK_FALSE(7_n > 7_p); + CHECK(3_n > 2_p); + CHECK_FALSE(0_n > 1_p); + } + + TEST_CASE("nonnegative_int <= positive_int") { + CHECK(4_n <= 5_p); + CHECK(7_n <= 7_p); + CHECK_FALSE(3_n <= 2_p); + CHECK(0_n <= 1_p); + } + + TEST_CASE("nonnegative_int != 
positive_int") { + CHECK(4_n != 5_p); + CHECK_FALSE(7_n != 7_p); + CHECK(3_n != 2_p); + CHECK(0_n != 1_p); + } + + TEST_CASE("nonnegative_int >= positive_int") { + CHECK_FALSE(4_n >= 5_p); + CHECK(7_n >= 7_p); + CHECK(3_n >= 2_p); + CHECK_FALSE(0_n >= 1_p); + } + + TEST_CASE("positive_int < int") { + CHECK(4_p < 5); + CHECK_FALSE(7_p < 7); + CHECK_FALSE(3_p < 2); + CHECK_FALSE(1_p < -3); + } + + TEST_CASE("positive_int == int") { + CHECK_FALSE(4_p == 5); + CHECK(7_p == 7); + CHECK_FALSE(3_p == 2); + CHECK_FALSE(1_p == -3); + } + + TEST_CASE("positive_int > int") { + CHECK_FALSE(4_p > 5); + CHECK_FALSE(7_p > 7); + CHECK(3_p > 2); + CHECK(1_p > -3); + } + + TEST_CASE("positive_int <= int") { + CHECK(4_p <= 5); + CHECK(7_p <= 7); + CHECK_FALSE(3_p <= 2); + CHECK_FALSE(1_p <= -3); + } + + TEST_CASE("positive_int != int") { + CHECK(4_p != 5); + CHECK_FALSE(7_p != 7); + CHECK(3_p != 2); + CHECK(1_p != -3); + } + + TEST_CASE("positive_int >= int") { + CHECK_FALSE(4_p >= 5); + CHECK(7_p >= 7); + CHECK(3_p >= 2); + CHECK(1_p >= -3); + } + + TEST_CASE("int < positive_int") { + CHECK(4 < 5_p); + CHECK_FALSE(7 < 7_p); + CHECK_FALSE(3 < 2_p); + CHECK(-3 < 1_p); + } + + TEST_CASE("int == positive_int") { + CHECK_FALSE(4 == 5_p); + CHECK(7 == 7_p); + CHECK_FALSE(3 == 2_p); + CHECK_FALSE(-3 == 1_p); + } + + TEST_CASE("int > positive_int") { + CHECK_FALSE(4 > 5_p); + CHECK_FALSE(7 > 7_p); + CHECK(3 > 2_p); + CHECK_FALSE(-3 > 1_p); + } + + TEST_CASE("int <= positive_int") { + CHECK(4 <= 5_p); + CHECK(7 <= 7_p); + CHECK_FALSE(3 <= 2_p); + CHECK(-3 <= 1_p); + } + + TEST_CASE("int != positive_int") { + CHECK(4 != 5_p); + CHECK_FALSE(7 != 7_p); + CHECK(3 != 2_p); + CHECK(-3 != 1_p); + } + + TEST_CASE("int >= positive_int") { + CHECK_FALSE(4 >= 5_p); + CHECK(7 >= 7_p); + CHECK(3 >= 2_p); + CHECK_FALSE(-3 >= 1_p); + } + + TEST_CASE("positive_int + positive_int") { + CHECK(4_p + 2_p == 6_p); + } + + TEST_CASE("positive_int + nonnegative_int") { + CHECK(4_p + 3_n == 7_p); + } + + TEST_CASE("++positive_int") { + positive_int x = 3_p; + CHECK(++x == 4_p); + CHECK(x == 4_p); + } + + TEST_CASE("positive_int++") { + positive_int x = 3_p; + CHECK(x++ == 3_p); + CHECK(x == 4_p); + } + + TEST_CASE("positive_int += positive_int ") { + positive_int x = 3_p; + + SUBCASE("single application") { + CHECK((x += 2_p) == 5_p); + CHECK(x == 5_p); + } + + SUBCASE("repeated application") { + CHECK(((x += 2_p) += 4_p) == 9_p); + CHECK(x == 9_p); + } + } + + TEST_CASE("positive_int += nonnegative_int") { + positive_int x = 3_p; + + SUBCASE("rhs is positive") { + CHECK((x += 2_n) == 5_p); + CHECK(x == 5_p); + } + + SUBCASE("rhs is zero") { + CHECK((x += 0_n) == 3_p); + CHECK(x == 3_p); + } + + SUBCASE("repeated application") { + CHECK(((x += 2_n) += 4_n) == 9_p); + CHECK(x == 9_p); + } + } + + TEST_CASE("positive_int * positive_int") { + CHECK(3_p * 4_p == 12_p); + } + + TEST_CASE("positive_int *= positive_int") { + positive_int x = 5_p; + + SUBCASE("single application") { + CHECK((x *= 2_p) == 10_p); + CHECK(x == 10_p); + } + + SUBCASE("repeated application") { + CHECK(((x *= 2_p) *= 3_p) == 30_p); + CHECK(x == 30_p); + } + } + + TEST_CASE("positive_int * nonnegative_int") { + CHECK(3_p * 4_n == 12_n); + CHECK(3_p * 0_n == 0_n); + } + + TEST_CASE("positive_int / positive_int") { + CHECK(4_p / 2_p == 2_n); + CHECK(4_p / 3_p == 1_n); + CHECK(4_p / 4_p == 1_n); + CHECK(4_p / 5_p == 0_n); + } + + TEST_CASE("nonnegative_int / positive_int") { + CHECK(4_n / 2_p == 2_n); + CHECK(4_n / 3_p == 1_n); + CHECK(4_n / 4_p == 1_n); + CHECK(4_n / 5_p 
== 0_n); + + CHECK(0_n / 1_p == 0_n); + } + + TEST_CASE("float / positive_int") { + CHECK(4.0f / 2_p == 2.0f); + CHECK(3.0f / 2_p == 1.5f); + CHECK(-3.0f / 4_p == -0.75f); + CHECK(0.0f / 1_p == 0.0f); + } + + TEST_CASE("float /= positive_int") { + SUBCASE("divides evenly") { + float x = 4.0f; + CHECK((x /= 2_p) == 2.0f); + CHECK(x == 2.0f); + } + + SUBCASE("does not divide evenly") { + float x = 3.0f; + CHECK((x /= 2_p) == 1.5f); + CHECK(x == 1.5f); + } + + SUBCASE("numerator is negative") { + float x = -3.0f; + CHECK((x /= 4_p) == -0.75f); + CHECK(x == -0.75f); + } + + SUBCASE("numerator is zero") { + float x = 0.0f; + CHECK((x /= 4_p) == 0.0f); + CHECK(x == 0.0f); + } + + SUBCASE("repeated /=") { + float x = 20.0f; + CHECK(((x /= 4_p) /= 2_p) == 2.5f); + CHECK(x == 2.5f); + } + } + + TEST_CASE("positive_int % positive_int") { + CHECK(4_p % 3_p == 1_n); + CHECK(5_p % 5_p == 0_n); + } + + TEST_CASE("nonnegative_int % positive_int") { + CHECK(4_n % 3_p == 1_n); + CHECK(5_n % 5_p == 0_n); + CHECK(0_n % 3_p == 0_n); + } + + TEST_CASE("positive_int::int_from_positive_int()") { + CHECK((3_p).int_from_positive_int() == 3); + } + + TEST_CASE("positive_int::nonnegative_int_from_positive_int()") { + CHECK((4_p).nonnegative_int_from_positive_int() == 4); + } + + TEST_CASE("positive_int::operator<<(std::ostream &, positive_int)") { + std::ostringstream oss; + oss << 3_p; + + std::string result = oss.str(); + std::string correct = "3"; + + CHECK(result == correct); + } + + TEST_CASE("positive_int fmt support") { + std::string result = fmt::to_string(14_p); + std::string correct = "14"; + + CHECK(result == correct); + } + + TEST_CASE("adl_serializer") { + SUBCASE("to_json") { + positive_int input = 5_p; + + nlohmann::json result = input; + nlohmann::json correct = 5; + + CHECK(result == correct); + } + + SUBCASE("from_json") { + nlohmann::json input = 5; + + positive_int result = input.template get(); + positive_int correct = 5_p; + + CHECK(result == correct); + } + } + + TEST_CASE("std::hash") { + positive_int nn_int_1a = positive_int{1}; + positive_int nn_int_1b = positive_int{1}; + positive_int nn_int_2 = positive_int{2}; + std::hash hash_fn; + + SUBCASE("Identical values have the same hash") { + CHECK(hash_fn(nn_int_1a) == hash_fn(nn_int_1b)); + } + + SUBCASE("Different values have different hashes") { + CHECK(hash_fn(nn_int_1a) != hash_fn(nn_int_2)); + } + + SUBCASE("unordered_set works with positive_int") { + std::unordered_set<::FlexFlow::positive_int> positive_int_set; + positive_int_set.insert(nn_int_1a); + positive_int_set.insert(nn_int_1b); + positive_int_set.insert(nn_int_2); + + CHECK(positive_int_set.size() == 2); + } + } + + TEST_CASE("rc::Arbitrary") { + RC_SUBCASE([](positive_int) { }); } } From 9d4f90b54a0929b482aa33bd3032ee1026188063 Mon Sep 17 00:00:00 2001 From: fruitea Date: Mon, 26 May 2025 12:57:48 -0700 Subject: [PATCH 77/91] test: add realm backend e2e test --- .../local-execution/model_training_instance.h | 1 + .../src/model_training_instance.cc | 10 ++ lib/local-execution/test/src/test_e2e.cc | 140 +++++++++++++++++ .../test/src/test_local_cost_estimator.cc | 4 +- .../test/src/test_loss_functions.cc | 2 +- .../test/src/test_task_registry.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- .../realm-backend/model_training_instance.h | 1 + .../src/model_training_instance.cc | 10 ++ lib/realm-backend/test/src/test_e2e.cc | 145 ++++++++++++++++++ 10 files changed, 312 insertions(+), 5 deletions(-) create mode 100644
lib/local-execution/test/src/test_e2e.cc create mode 100644 lib/realm-backend/test/src/test_e2e.cc diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index b36b20ed04..54b76313ab 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,6 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); + GenericTensorAccessorW get_loss_tensor_backing(); }; } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d404221d88..f232011230 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,4 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } +GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing + .tensor_gradient_mapping.at(this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + return loss_tensor_backing; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc new file mode 100644 index 0000000000..05aaab0c88 --- /dev/null +++ b/lib/local-execution/test/src/test_e2e.cc @@ -0,0 +1,140 @@ +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" +#include "utils/containers/get_only.h" +#include "local-execution/model_training_instance.h" +#include + +using namespace ::FlexFlow; + +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { + float* first_epoch_ptr = first_epoch.get_float_ptr(); + float* last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + for (int i = 0; i < batch_size; i++) { + if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + return false; + } + } + + return true; +} + + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("E2ETest") { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + + Allocator allocator = create_local_cuda_memory_allocator(); + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{label_tensor}, + label_tensor_backing}}, + {}, + {}}; + + // construct 
computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 10; + std::vector loss_values (num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + loss_values[i] = model_training_instance.get_loss_tensor_backing(); + } + + // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + } +} \ No newline at end of file diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 30682c9a48..0fa841be20 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,8 +9,8 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Cost Estimator") { +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalCostEstimator") { // local backing initialization ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index 2bf138e204..ae76dcccf9 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -14,7 +14,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Loss Functions") { + TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; diff --git 
a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index dd4b6f5b44..16877b0e09 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -9,7 +9,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Task Registry") { + TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; nonnegative_int embed_dim = 32_n; diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 1f8684f38a..dcd9c025b3 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Execute Update") { + TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 6c92b1de4a..bc9c79dccf 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -28,6 +28,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); + GenericTensorAccessorW get_loss_tensor_backing(); }; } // namespace FlexFlow diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index 8ced02e95a..d420776e42 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -73,4 +73,14 @@ void ModelTrainingInstance::update() { this->optimizer_attrs); } +GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { + gradient_tensor_t loss_tensor = + this->training_backing.realm_tensor_backing + .tensor_gradient_mapping.at(this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.realm_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + return loss_tensor_backing; +} + } // namespace FlexFlow diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc new file mode 100644 index 0000000000..040b268128 --- /dev/null +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -0,0 +1,145 @@ +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" +#include "utils/containers/get_only.h" +#include "realm-backend/model_training_instance.h" +#include + +using namespace ::FlexFlow; + +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { + float* first_epoch_ptr = first_epoch.get_float_ptr(); + float* last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + for (int i = 0; i < batch_size; i++) { + if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + return false; + } + } + + return true; +} + +void top_level_task(const 
void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p) { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + std::vector worker_procs; + std::vector allocators; + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + assert(pq.count() > 0); + for (Processor p : pq) { + worker_procs.push_back(p); + allocators.push_back(create_realm_memory_allocator(p)); + } + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{label_tensor}, + label_tensor_backing}}, + {}, + {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + { + printf("\nRunning test %d: E2ETest...\n", 1); + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + allocator, realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 10; + std::vector loss_values (num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + loss_values[i] = model_training_instance.get_loss_tensor_backing(); + } + // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + 
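// did_loss_decrease (defined at the top of this file) returns false if any + // sample's loss in the last epoch exceeds its loss in the first epoch. +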
CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + printf("passed\n"); + } + } +} \ No newline at end of file From f3e2a27555e8ad79727552363be75a45944de890 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 27 May 2025 11:19:44 -0700 Subject: [PATCH 78/91] tweak: minor --- lib/realm-backend/test/src/test_e2e.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index 040b268128..659b0e5977 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -1,4 +1,3 @@ -#include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/allocated_tensors.h" @@ -14,6 +13,7 @@ #include using namespace ::FlexFlow; +using namespace Realm; bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { float* first_epoch_ptr = first_epoch.get_float_ptr(); @@ -58,7 +58,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, DataType::FLOAT}; GenericTensorAccessorW label_tensor_backing = - allocator.allocate_tensor(output_tensor_shape); + allocators[0].allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ { {TensorTypeVariant{label_tensor}, @@ -125,7 +125,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, optimizer_attrs); // begin training loop ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs }; int num_epochs = 10; From ba85fe4c022622a88fa230a0d0446c3607921cd9 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 15:32:05 -0700 Subject: [PATCH 79/91] Pass cost estimator test --- .../include/local-execution/task_argument_accessor.h | 8 +++++++- lib/local-execution/src/local_cost_estimator.cc | 12 ++++++------ lib/local-execution/src/local_training_backing.cc | 7 ++++--- .../test/src/test_local_cost_estimator.cc | 2 +- .../src/per_device_op_state.cc} | 0 5 files changed, 18 insertions(+), 11 deletions(-) rename lib/{local-execution/src/per_device_state.cc => task-spec/src/per_device_op_state.cc} (100%) diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 99c1c1296b..285b41991a 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -14,7 +14,13 @@ struct TaskArgumentAccessor { if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { PerDeviceOpState device_states = this->ptr->get_concrete_arg(slot).get(); - return device_states.get(); + if (device_states.has()) { + return device_states.get(); + } else { + throw mk_runtime_error( + fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", + device_states.index())); + } } else { return this->ptr->get_concrete_arg(slot).get(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 532fcc91c2..0ee6c9a987 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,12 +90,12 @@ CostDetails 
LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = - get_layer_by_name(computation_graph, "operator"); - float fwd = - execute_forward(local_backing, operator_layer_guid, allocator).value(); - float bwd = - execute_backward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); + + float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + std::cout << "completed forward" << std::endl; + float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index b2e0a2fb7e..7d916715f5 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,8 +104,7 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; + auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -116,13 +115,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 0fa841be20..e493265f86 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*kdim=*/embed_dim, /*vdim=*/embed_dim, /*dropout=*/0.0, - /*bias=*/true, + /*bias=*/false, /*add_bias_kv=*/false, /*add_zero_attn=*/false, }; diff --git a/lib/local-execution/src/per_device_state.cc b/lib/task-spec/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/task-spec/src/per_device_op_state.cc From ed0a164042c86dd063d40316271b374ef422215b Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 28 May 2025 02:10:09 -0700 Subject: [PATCH 80/91] feat: fix e2e test --- lib/kernels/include/kernels/accessor.h | 1 + .../kernels/managed_per_device_ff_handle.h | 6 +- lib/kernels/src/accessor.cc | 10 + .../src/managed_per_device_ff_handle.cc | 22 +- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 2 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 2 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- 
lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- .../local-execution/model_training_instance.h | 2 +- .../local-execution/task_argument_accessor.h | 6 +- .../src/local-execution/ops/linear.cc | 6 +- .../src/local_cost_estimator.cc | 11 +- .../src/local_training_backing.cc | 9 +- .../src/model_training_instance.cc | 4 +- lib/local-execution/src/optimizer.cc | 2 +- lib/local-execution/test/src/test_e2e.cc | 122 +++++---- .../test/src/test_local_cost_estimator.cc | 2 +- .../test/src/test_loss_functions.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- lib/pcg/include/pcg/computation_graph.h | 2 + lib/pcg/src/pcg/computation_graph.cc | 14 + .../realm-backend/model_training_instance.h | 2 +- .../include/realm-backend/task_wrapper.h | 6 +- .../src/model_training_instance.cc | 4 +- .../src/realm_training_backing.cc | 4 +- lib/realm-backend/src/task_wrapper.cc | 32 ++- lib/realm-backend/test/src/test_e2e.cc | 255 ++++++++++-------- lib/realm-backend/test/src/test_update.cc | 2 +- 42 files changed, 341 insertions(+), 223 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..55b120b090 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -75,6 +75,7 @@ std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); float *get_float_ptr(GenericTensorAccessorW const &); +void write_to_host_float_ptr(GenericTensorAccessorW const &, float *); double *get_double_ptr(GenericTensorAccessorW const &); half *get_half_ptr(GenericTensorAccessorW const &); std::vector diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..05e8406de8 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle(int num_ranks, int my_rank); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -25,6 +25,10 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle *handle; }; +ManagedPerDeviceFFHandle initialize_single_gpu_handle(); +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank); + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..7f4f61c271 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,4 +1,5 @@ #include "kernels/accessor.h" +#include "device.h" namespace FlexFlow { @@ -76,6 +77,15 @@ float *get_float_ptr(GenericTensorAccessorW const &a) { return get(a); } +void write_to_host_float_ptr(GenericTensorAccessorW const &a, float *host_ptr) { + float *device_ptr = get(a); + int total_elements = get_volume(a.shape).unwrap_nonnegative(); + checkCUDA(cudaMemcpy(host_ptr, + device_ptr, + total_elements * sizeof(float), + cudaMemcpyDeviceToHost)); +} + double *get_double_ptr(GenericTensorAccessorW const &a) { return get(a); } diff --git 
a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..e327a7b1e1 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,9 +1,10 @@ #include "kernels/managed_per_device_ff_handle.h" #include "device.h" +#include "kernels/nccl.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle(int num_ranks, int my_rank) { handle = new PerDeviceFFHandle; handle->workSpaceSize = 1024 * 1024; handle->allowTensorOpMathConversion = true; @@ -11,6 +12,13 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { checkCUDNN(cudnnCreate(&handle->dnn)); checkCUBLAS(cublasCreate(&handle->blas)); checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); + +#ifdef FF_USE_NCCL + ncclUniqueId ncclId; + checkNCCL(ncclGetUniqueId(&ncclId)); + checkNCCL(ncclCommInitRank( + &handle->ncclComm, num_ranks, ncclId, my_rank)); // todo generalize +#endif } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -28,6 +36,9 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUDNN(cudnnDestroy(handle->dnn)); checkCUBLAS(cublasDestroy(handle->blas)); checkCUDA(cudaFree(handle->workSpace)); +#ifdef FF_USE_NCCL + checkNCCL(ncclCommDestroy(handle->ncclComm)); +#endif delete handle; } } @@ -36,4 +47,13 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } +ManagedPerDeviceFFHandle initialize_single_gpu_handle() { + return ManagedPerDeviceFFHandle(1, 0); +} + +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank) { + return ManagedPerDeviceFFHandle(num_ranks, my_rank); +} + } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..a15497984c 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..b9cfbf3ec5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..94ce268b93 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..68f35cb099 100644 --- 
a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..ca6b95dadc 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int size_per_input = 100_n; ff_dim_t concat_axis = ff_dim_t{0_n}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; TensorShape input_shape = diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..7e78544df8 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..c9e1778843 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..ffe8e0dfd2 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..9e89c86433 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape feature_shape = make_float_tensor_shape_from_legion_dims({feature_size}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..281a146a30 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; 
TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..874e2b8d98 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -22,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..7f993c12d3 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims( {10_n, 10_n, 10_n, 10_n, 10_n}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..8c47c2a49a 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..1e969f6d82 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..ba808c491a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..cba293aed1 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { 
nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..65d1ed7783 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..f7007d76e4 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 54b76313ab..2deed6b0a2 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - GenericTensorAccessorW get_loss_tensor_backing(); + void write_loss_tensor_to_host(float *host_ptr); }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 285b41991a..499b5ff7d6 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -17,9 +17,9 @@ struct TaskArgumentAccessor { if (device_states.has()) { return device_states.get(); } else { - throw mk_runtime_error( - fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); + throw mk_runtime_error(fmt::format( + "Invalid access to PerDeviceOpState attempted, instead it holds: {}", + device_states.index())); } } else { return this->ptr->get_concrete_arg(slot).get(); diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc index 94f92d37ee..768293b32f 100644 --- a/lib/local-execution/src/local-execution/ops/linear.cc +++ b/lib/local-execution/src/local-execution/ops/linear.cc @@ -89,7 +89,6 @@ static
std::optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size.unwrap_nonnegative()); } -; - static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -137,6 +134,7 @@ static std::optional float const *bias_ptr = NULL; if (attrs.use_bias) { + auto bias = acc.get_tensor(BIAS); bias_ptr = bias.get_float_ptr(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0ee6c9a987..0a84c19066 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,11 +90,14 @@ CostDetails LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - - float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = + get_layer_by_name(computation_graph, "operator"); + + float fwd = + execute_forward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed forward" << std::endl; - float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + float bwd = + execute_backward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 7d916715f5..d508c34210 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,7 +104,8 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = task_sig_impl.impl_function.get().function_ptr; + auto fn = + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -115,15 +116,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { - + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index f232011230..d214d0d426 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,14 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } -GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { +void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing .tensor_gradient_mapping.at(this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - 
return loss_tensor_backing; + write_to_host_float_ptr(loss_tensor_backing, host_ptr); } } // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1b9ce83d14..1b8fc37b2d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -70,7 +70,7 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() & - weight.shape.get_volume().unwrap_nonnegative() == 0); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % + weight.shape.get_volume().unwrap_nonnegative() == 0); int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 05aaab0c88..ccad60a900 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -3,48 +3,42 @@ #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" +#include "local-execution/model_training_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" #include "utils/containers/get_only.h" -#include "local-execution/model_training_instance.h" #include using namespace ::FlexFlow; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { - float* first_epoch_ptr = first_epoch.get_float_ptr(); - float* last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { for (int i = 0; i < batch_size; i++) { - if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + if (first_epoch[i] < last_epoch[i]) { return false; } } - return true; } TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("E2ETest") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); // allocate label tensors LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = - loss_tensor_source.new_loss_tensor(); + loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); nonnegative_int batch_size = 10_n; nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + nonnegative_int hidden_dim = 32_n; + nonnegative_int output_dim = 1_n; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; @@ -53,11 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{label_tensor}, - label_tensor_backing}}, - {}, - {}}; + {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); @@ -66,32 +56,55 @@ TEST_SUITE(FF_TEST_SUITE) { TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorShape weight_shape_1 = TensorShape{ +
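// first-layer weights: data_dim x hidden_dim (the second layer below is + // hidden_dim x output_dim) +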
TensorDims{FFOrdered{data_dim, hidden_dim}}, + DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = - add_input_layer(computation_graph, input_tensor_shape); + add_input_layer_with_grad(computation_graph, input_tensor_shape); - LayerAddedResult weights_layer = add_layer( + LayerAddedResult weights_layer_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, std::nullopt}, {}, {}); - LayerAddedResult linear_operator = add_layer( + LayerAddedResult weights_layer_2 = add_layer( computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + weights_layer_1.outputs); + + LayerAddedResult linear_operator_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + linear_operator_1.outputs, + weights_layer_2.outputs); + + tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), @@ -99,42 +112,57 @@ TEST_SUITE(FF_TEST_SUITE) { ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - - // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs - }; - - int num_epochs = 10; - std::vector loss_values (num_epochs); + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = + ModelTrainingInstance{allocator, + local_training_backing, + logit_tensor, + label_tensor, + loss_attrs, + optimizer_attrs}; + + int num_epochs = 5; + int num_samples = batch_size.unwrap_nonnegative(); + std::vector loss_values(num_epochs); for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - loss_values[i] = model_training_instance.get_loss_tensor_backing(); + float *host_loss_ptr = new 
float[num_samples]; + model_training_instance.write_loss_tensor_to_host(host_loss_ptr); + loss_values[i] = host_loss_ptr; + } + + // Assert that each sample in the batch has a lower loss in last epoch than + // the first epoch + float *first_epoch = loss_values[0]; + float *last_epoch = loss_values[num_epochs - 1]; + CHECK(did_loss_decrease( + first_epoch, last_epoch, batch_size.unwrap_nonnegative())); + + for (int i = 0; i < num_epochs; i++) { + delete[] loss_values[i]; } - - // Assert that each sample in the batch has a lower loss in last epoch than the first epoch - CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); } } \ No newline at end of file diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index e493265f86..c9c5afe04e 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { // local backing initialization - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index ae76dcccf9..ca2482653b 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index dcd9c025b3..75ba517d1b 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index efc955ec92..60e825c11a 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -24,6 +24,8 @@ LayerAddedResult add_layer( LayerAddedResult add_input_layer(ComputationGraph &computation_graph, TensorShape const &tensor_shape); +LayerAddedResult add_input_layer_with_grad(ComputationGraph &computation_graph, + TensorShape const &tensor_shape); TensorAttrs get_tensor_attrs(ComputationGraph const &, tensor_guid_t const &); bool are_tensor_guid_shapes_equivalent(ComputationGraph const &cg, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 200410dd7b..b8917eed35 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -100,6 +100,20 @@ LayerAddedResult add_input_layer(ComputationGraph &cg, /*outputs=*/std::vector{CreateGrad::NO}); } +LayerAddedResult 
add_input_layer_with_grad(ComputationGraph &cg, + TensorShape const &tensor_shape) { + LayerAttrs layer_attrs = LayerAttrs{ + /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}}, + /*name=*/std::nullopt, + }; + + return add_layer(cg, + layer_attrs, + /*inputs=*/{}, + /*weights=*/{}, + /*outputs=*/std::vector{CreateGrad::YES}); +} + TensorAttrs get_tensor_attrs(ComputationGraph const &cg, tensor_guid_t const &t) { return cg.raw_graph.at(t.raw_graph_output); diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index bc9c79dccf..049836d042 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -28,7 +28,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - GenericTensorAccessorW get_loss_tensor_backing(); + void write_loss_tensor_to_host(float *host_ptr); }; } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h index 8265ca398b..64a360e549 100644 --- a/lib/realm-backend/include/realm-backend/task_wrapper.h +++ b/lib/realm-backend/include/realm-backend/task_wrapper.h @@ -25,11 +25,11 @@ void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p); -void register_wrapper_tasks_init(Realm::Processor p, task_id_t task_id); +void register_wrapper_tasks_init(int p_id, Realm::Processor p, task_id_t task_id); -void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id); +void register_wrapper_tasks_fwdbwd(int p_id, Realm::Processor p, task_id_t task_id); -void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id); +void register_wrapper_tasks_generic(int p_id, Realm::Processor p, task_id_t task_id); void register_wrapper_tasks(int pid, Realm::Processor p, task_id_t task_id, TaskSignatureAndImpl task_sig_impl); diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index d420776e42..0c318f8942 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -73,14 +73,14 @@ void ModelTrainingInstance::update() { this->optimizer_attrs); } -GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { +void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { gradient_tensor_t loss_tensor = this->training_backing.realm_tensor_backing .tensor_gradient_mapping.at(this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.realm_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - return loss_tensor_backing; + write_to_host_float_ptr(loss_tensor_backing, host_ptr); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 3b7eb48823..e2e28e9929 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -276,7 +276,7 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, realm_training_backing.realm_args_backing, invocation, realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; - 
register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0], task_id); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); // TODO: multi gpu launching @@ -311,7 +311,7 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, realm_training_backing.realm_args_backing, loss_invocation, realm_training_backing.allocators[0]); task_id_t task_id = loss_invocation.task_id; - register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0], task_id); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); // TODO: multi gpu launching diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index f07f11b60d..1a01fb0a58 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -38,21 +38,36 @@ void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, fn(task_args.accessor); } -void register_wrapper_tasks_init(Processor p, task_id_t task_id) { +void register_wrapper_tasks_init(int p_id, Processor p, task_id_t task_id) { + std::pair key = {p_id, task_id}; + if (registered_tasks.find(key) != registered_tasks.end()) { + return; + } + registered_tasks.insert(key); Processor::register_task_by_kind( p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(init_wrapper_task), ProfilingRequestSet()) .external_wait(); } -void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) { +void register_wrapper_tasks_fwdbwd(int p_id, Realm::Processor p, task_id_t task_id) { + std::pair key = {p_id, task_id}; + if (registered_tasks.find(key) != registered_tasks.end()) { + return; + } + registered_tasks.insert(key); Processor::register_task_by_kind( p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(fwdbwd_wrapper_task), ProfilingRequestSet()) .external_wait(); } -void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) { +void register_wrapper_tasks_generic(int p_id, Realm::Processor p, task_id_t task_id) { + std::pair key = {p_id, task_id}; + if (registered_tasks.find(key) != registered_tasks.end()) { + return; + } + registered_tasks.insert(key); Processor::register_task_by_kind( p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(generic_wrapper_task), ProfilingRequestSet()) @@ -61,21 +76,16 @@ void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) { void register_wrapper_tasks(int p_id, Processor p, task_id_t task_id, TaskSignatureAndImpl task_sig_impl) { - std::pair key = {p_id, task_id}; - if (registered_tasks.find(key) != registered_tasks.end()) { - return; - } - registered_tasks.insert(key); switch (task_sig_impl.task_signature.type) { case OpTaskType::INIT: - register_wrapper_tasks_init(p, task_id); + register_wrapper_tasks_init(p_id, p, task_id); break; case OpTaskType::FWD: case OpTaskType::BWD: - register_wrapper_tasks_fwdbwd(p, task_id); + register_wrapper_tasks_fwdbwd(p_id, p, task_id); break; default: - register_wrapper_tasks_generic(p, task_id); + register_wrapper_tasks_generic(p_id, p, task_id); break; } } diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index 659b0e5977..ba180494c3 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -10,136 +10,163 @@ #include "test_utils.h" #include 
"utils/containers/get_only.h" #include "realm-backend/model_training_instance.h" -#include using namespace ::FlexFlow; using namespace Realm; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { - float* first_epoch_ptr = first_epoch.get_float_ptr(); - float* last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { for (int i = 0; i < batch_size; i++) { - if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + if (first_epoch[i] < last_epoch[i]) { return false; } } - return true; } void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p) { - // initialize runtime - ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; - std::vector worker_procs; - std::vector allocators; - Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) - .only_kind(Processor::TOC_PROC); - assert(pq.count() > 0); - for (Processor p : pq) { - worker_procs.push_back(p); - allocators.push_back(create_realm_memory_allocator(p)); + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + std::vector worker_procs; + std::vector allocators; + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + assert(pq.count() > 0); + for (Processor p : pq) { + worker_procs.push_back(p); + allocators.push_back(create_realm_memory_allocator(p)); + } + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int hidden_dim = 32_n; + nonnegative_int output_dim = 1_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocators[0].allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{data_dim, hidden_dim}}, + DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer_with_grad(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult weights_layer_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + 
weights_layer_1.outputs); + + LayerAddedResult linear_operator_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + linear_operator_1.outputs, + weights_layer_2.outputs); + + tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + { + printf("\nRunning test %d: E2ETest...\n", 1); + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 5; + int num_samples = batch_size.unwrap_nonnegative(); + std::vector loss_values(num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + float *host_loss_ptr = new float[num_samples]; + model_training_instance.write_loss_tensor_to_host(host_loss_ptr); + loss_values[i] = host_loss_ptr; } - // allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = - loss_tensor_source.new_loss_tensor(); - - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; - - TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; - - GenericTensorAccessorW label_tensor_backing = - allocators[0].allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{label_tensor}, - label_tensor_backing}}, - {}, - {}}; - - // construct computation graph - ComputationGraph computation_graph = make_empty_computation_graph(); - - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; - - TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; - - LayerAddedResult inputs_layer = - add_input_layer(computation_graph, input_tensor_shape); - - LayerAddedResult weights_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, - std::nullopt}, - {}, - {}); - - LayerAddedResult linear_operator = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, - std::nullopt}}, - std::nullopt}, - inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - 
DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - - - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - - { - printf("\nRunning test %d: E2ETest...\n", 1); - RealmTrainingBacking realm_training_backing = RealmTrainingBacking( - p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, - optimizer_tensor_source, computation_graph, runtime_arg_config, - optimizer_attrs); - // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs - }; - - int num_epochs = 10; - std::vector loss_values (num_epochs); - - for (int i = 0; i < num_epochs; i++) { - model_training_instance.forward(); - model_training_instance.backward(); - model_training_instance.update(); - loss_values[i] = model_training_instance.get_loss_tensor_backing(); - } - // Assert that each sample in the batch has a lower loss in last epoch than the first epoch - CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + // Assert that each sample in the batch has a lower loss in last epoch than + // the first epoch + float *first_epoch = loss_values[0]; + float *last_epoch = loss_values[num_epochs - 1]; + if(did_loss_decrease( + first_epoch, last_epoch, batch_size.unwrap_nonnegative())) { printf("passed\n"); + } else { + printf("failed\n"); + } + + for (int i = 0; i < num_epochs; i++) { + delete[] loss_values[i]; } } } \ No newline at end of file diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc index 0b332d1ccc..b1f6bebe74 100644 --- a/lib/realm-backend/test/src/test_update.cc +++ b/lib/realm-backend/test/src/test_update.cc @@ -16,7 +16,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p) { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); std::vector worker_procs; std::vector allocators; Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) From 7755b9488a0858fa802bcc5b72d5588a0168400b Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 28 May 2025 02:45:44 -0700 Subject: [PATCH 81/91] fix: TaskArgumentAccessor holds a shared_ptr, which needs to be handled carefully across threads.
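Processor::spawn copies the argument buffer by value as raw bytes, so spawning with a stack-allocated RealmTaskArgs bit-copies the TaskArgumentAccessor inside it. The accessor owns a shared_ptr, and a byte copy neither bumps the reference count nor keeps the launcher's stack frame alive, so the asynchronous task can run against a dead object. The changes below instead heap-allocate the RealmTaskArgs at every launch site, marshal only the pointer value (a single uintptr_t) through Realm, and have each wrapper task recover the pointer, invoke the impl function, and delete the args exactly once.

A minimal sketch of the ownership pattern, with stand-in types in place of the real Realm/FlexFlow ones so it is self-contained (marshal/task_body are illustrative names, not APIs from this repo):

    #include <cassert>
    #include <cstdint>
    #include <memory>

    struct Accessor { std::shared_ptr<int> state; };  // stands in for TaskArgumentAccessor
    struct TaskArgs { Accessor accessor; };           // stands in for RealmTaskArgs<T>

    // Launch side: heap-allocate, then pass only the pointer value through
    // the byte-copied argument buffer.
    uintptr_t marshal(Accessor const &acc) {
      TaskArgs *heap_args = new TaskArgs{acc};        // shared_ptr refcount bumped here
      return reinterpret_cast<uintptr_t>(heap_args);
    }

    // Task side: recover the pointer, use the accessor, free exactly once.
    void task_body(void const *args, size_t arglen) {
      assert(arglen == sizeof(uintptr_t));
      uintptr_t raw = *reinterpret_cast<uintptr_t const *>(args);
      TaskArgs *task_args = reinterpret_cast<TaskArgs *>(raw);
      // ... invoke the impl function with task_args->accessor ...
      delete task_args;                               // drops the shared_ptr on the worker
    }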
--- .../src/realm_training_backing.cc | 33 ++++++++++------- lib/realm-backend/src/task_wrapper.cc | 37 ++++++++++++------- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index e2e28e9929..4e36bf8d5c 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -137,11 +137,12 @@ initialize_args_backing(RealmTrainingBacking *backing, // TODO: multi gpu launching Promise promise = Promise(); Future future = promise.get_future(); - RealmTaskArgs args{ + RealmTaskArgs* task_arg = new RealmTaskArgs{ task_id, impl_function, accessor, std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = worker_procs[0].spawn(get_realm_task_id(task_id), - &args, sizeof(args), worker_events[0]); + args, sizeof(uintptr_t), worker_events[0]); worker_events[0] = e; future.set_event(e); per_device_op_states.insert({node, future.get().value()}); @@ -185,10 +186,11 @@ execute_forward(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise(realm_training_backing.master_mem); Future future = promise.get_future(); - RealmTaskArgs args{task_id, impl_function, accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, impl_function, accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -232,10 +234,11 @@ execute_backward(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise(realm_training_backing.master_mem); Future future = promise.get_future(); - RealmTaskArgs args{task_id, impl_function, accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, impl_function, accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -282,10 +285,11 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); - RealmTaskArgs args{task_id, update_impl_fn, accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, update_impl_fn, accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -317,10 +321,11 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); - RealmTaskArgs args{task_id, loss_impl_fn, loss_accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, loss_impl_fn, loss_accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = 
realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index 1a01fb0a58..cb220f44dc 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -11,31 +11,40 @@ std::unordered_set> registered_tasks; void init_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + assert(arglen == sizeof(uintptr_t)); + uintptr_t task_arg_ptr = *reinterpret_cast(args); + RealmTaskArgs *task_args = + reinterpret_cast *>(task_arg_ptr); auto fn = - task_args.impl_function.get().function_ptr; - DeviceSpecificDeviceStates result = fn(task_args.accessor); - task_args.promise.set_value(result); + task_args->impl_function.get().function_ptr; + DeviceSpecificDeviceStates result = fn(task_args->accessor); + task_args->promise.set_value(result); + delete task_args; } void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + assert(arglen == sizeof(uintptr_t)); + uintptr_t task_arg_ptr = *reinterpret_cast(args); + RealmTaskArgs *task_args = + reinterpret_cast *>(task_arg_ptr); auto fn = - task_args.impl_function.get().function_ptr; - std::optional result = fn(task_args.accessor); - task_args.promise.set_value(result.has_value() ? result.value() : 0.0f); + task_args->impl_function.get().function_ptr; + std::optional result = fn(task_args->accessor); + task_args->promise.set_value(result.has_value() ? 
result.value() : 0.0f); + delete task_args; } void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + assert(arglen == sizeof(uintptr_t)); + uintptr_t task_arg_ptr = *reinterpret_cast(args); + RealmTaskArgs *task_args = + reinterpret_cast *>(task_arg_ptr); auto fn = - task_args.impl_function.get().function_ptr; - fn(task_args.accessor); + task_args->impl_function.get().function_ptr; + fn(task_args->accessor); + delete task_args; } void register_wrapper_tasks_init(int p_id, Processor p, task_id_t task_id) { From 335ac6d5abb2a67efb86363aefb38986ce181b1a Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Jun 2025 08:47:18 -0700 Subject: [PATCH 82/91] Expose test kernels, fill weights --- .../src/kernels/format_accessor_contents.cc | 8 +-- .../test/src/cpu/ops/replicate_kernels.cc | 2 +- .../test/src/cpu/ops/reverse_kernels.cc | 2 +- lib/kernels/test/src/kernels/accessor.cc | 2 +- .../src/kernels/compare_tensor_accessors.cc | 2 +- .../src/kernels/format_accessor_contents.cc | 2 +- .../src/kernels/reduce_tensor_accessor.cc | 2 +- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 2 +- lib/kernels/test/src/test_cast_kernel.cc | 2 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_cuda.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 2 +- .../test/src/test_managed_ff_stream.cc | 2 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- lib/kernels/test/src/test_replicate_kernel.cc | 8 +-- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- lib/local-execution/src/loss_functions.cc | 14 ++++- .../src/model_training_instance.cc | 25 +++++++++ .../src/unallocated_tensors.cc | 1 - lib/local-execution/test/src/test_e2e.cc | 52 ++++++++++++------- lib/task-spec/src/task-spec/ops/linear.cc | 29 ++++++++++- 33 files changed, 132 insertions(+), 57 deletions(-) diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc index ed54b21cfd..d40e5c4268 100644 --- a/lib/kernels/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -161,14 +161,14 @@ std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) { GenericTensorAccessorR cpu_accessor = copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); - int num_dims = accessor.shape.num_dims().unwrap_nonnegative(); + int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative(); switch (num_dims) { case 1: - return format_1d_accessor_r_contents(accessor); + return format_1d_accessor_r_contents(cpu_accessor); case 2: - return format_2d_accessor_r_contents(accessor); + return format_2d_accessor_r_contents(cpu_accessor); case 3: - return format_3d_accessor_r_contents(accessor); + return format_3d_accessor_r_contents(cpu_accessor); default: PANIC("Unhandled accessor 
dimensionality", num_dims); } diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc index b98b1745d5..1984fd5f83 100644 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels_cpu.h" diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc index 51025cd17b..5e27b9d350 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/reverse_kernels_cpu.h" diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 45e83cc0c6..31a6cba205 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -1,5 +1,5 @@ #include "kernels/accessor.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index 85ffa91315..4e85dfdaa0 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -1,5 +1,5 @@ #include "kernels/compare_tensor_accessors.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc index f515f2495b..a2b61b8dff 100644 --- a/lib/kernels/test/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -1,5 +1,5 @@ #include "kernels/format_accessor_contents.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc index a269cf4777..dd5f8e06f6 100644 --- a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -1,5 +1,5 @@ #include "kernels/reduce_tensor_accessor.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index f80c080f11..a086974a74 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/attention_kernels.h" #include diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc 
index dd98a36094..b0fe356c95 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/batch_matmul_kernels.h" #include diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 534901daf2..c173fd6d24 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/batch_norm_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 7539b2457c..9472e44a15 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/cast_kernels.h" #include "kernels/cast_kernels_cpu.h" #include diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index f3a2a8153d..7ac4d0f881 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/combine_kernels.h" #include "kernels/combine_kernels_cpu.h" #include diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 397b5cdf90..5dc8e441bd 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/concat_kernels.h" #include "utils/containers/repeat.h" #include diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index de3215cf2d..60bc6251b2 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include #include diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index c4518293dd..fb8b8dc87c 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/dropout_kernels.h" #include "utils/containers/count.h" #include diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 14930e280b..cea07ce781 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/flat_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 365fd3fb81..6a553bd107 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/gather_kernels.h" #include diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 3e63294e78..5382bb3a84 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ 
b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/layer_norm_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index ed2d8dc2b6..25a346446b 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/gather_kernels.h" #include diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 40a9eead53..c042ae3175 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/partition_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index a999311b81..58fff5c884 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/pool_2d_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index e2c4c36a71..4d030c4d93 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/reduction_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 5f58239a31..9806cefe8d 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels.h" @@ -31,7 +31,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input = - create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); + create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); GenericTensorAccessorW output = gpu_allocator.allocate_tensor(output_shape); @@ -47,7 +47,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad = - create_2d_accessor_r_with_contents( + create_2d_accessor_r_with_contents( { {1, 2, 3}, {4, 3, 3}, @@ -56,7 +56,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { gpu_allocator); GenericTensorAccessorR correct = - create_1d_accessor_r_with_contents( + create_1d_accessor_r_with_contents( {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); GenericTensorAccessorW input_grad = diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 066db28a17..011f35e567 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/reshape_kernels.h" #include diff --git a/lib/kernels/test/src/test_reverse_kernels.cc 
b/lib/kernels/test/src/test_reverse_kernels.cc index 6a0ad84a92..fc5c8deaad 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/reverse_kernels.h" #include "kernels/reverse_kernels_cpu.h" #include "op-attrs/datatype_value.h" diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index bf10b5c633..bb449f6755 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/softmax_kernels.h" #include diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 1c1c4d4d51..2597db95e0 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/split_kernels.h" #include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 8560d33e5b..c0b2d4db5e 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/transpose_kernels.h" #include diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index c23159a85d..99225b1895 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -16,6 +16,7 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" #include "local-execution/loss_functions.h" +#include "kernels/format_accessor_contents.h" #include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -55,6 +56,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); + int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] @@ -109,15 +111,23 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { + size_t logit_volume = get_num_elements(logit.shape).int_from_positive_int(); + size_t logit_grad_volume = + get_num_elements(logit_grad.shape).int_from_positive_int(); + profile(categorical_crossentropy_loss_backward_kernel, profiling, "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_num_elements(logit.shape).int_from_positive_int(), - get_num_elements(logit_grad.shape).int_from_positive_int(), + logit_volume, + logit_grad_volume, scale_factor); + + + std::cout << "Logit grad (loss) tensor after computation" << std::endl; + std::cout << format_accessor_w_contents(logit_grad) << std::endl; break; } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 790c5e8e18..847b1679ac 100644 --- 
a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,4 +1,5 @@ #include "local-execution/model_training_instance.h" +#include "kernels/format_accessor_contents.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/reversed.h" @@ -34,6 +35,17 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->label_tensor, this->allocator); + std::cout << "Done computing loss" << std::endl; + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + + std::cout << "Loss (logit grad) tensor" << std::endl; + std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; + PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : reversed( topological_ordering(this->training_backing.computation_graph))) { @@ -55,12 +67,25 @@ void ModelTrainingInstance::update() { } GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + GenericTensorAccessorW logit_tensor_backing = this->training_backing + .local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); + + // for (auto const &pair : + // this->training_backing.local_tensor_backing.tensor_backings) { + // std::cout << "Tensor type: " << pair.first << std::endl; + // std::cout << "Tensor " << std::endl; + // std::cout << format_accessor_w_contents(pair.second) << std::endl; + // } + gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); + + std::cout << "Loss (logit grad) tensor" << std::endl; + std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; return read_only_accessor_from_write_accessor(loss_tensor_backing); } diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc index 363d1eedef..b8daa90e3b 100644 --- a/lib/local-execution/src/unallocated_tensors.cc +++ b/lib/local-execution/src/unallocated_tensors.cc @@ -70,7 +70,6 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( num_optimizer_tensors_to_allocate -= allocated_tensors.optimizer_mapping.at(tensor_guid).size(); } - std::cout << num_optimizer_tensors_to_allocate; for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { optimizer_tensor_t optimizer_tensor = diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 8827e0269d..80b2e6a398 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,10 +1,7 @@ #include "kernels/compare_tensor_accessors.h" -#include "kernels/copy_tensor_accessor.h" -#include "kernels/local_cpu_allocator.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/format_accessor_contents.h" #include "kernels/tensor_accessor_reductions.h" +#include "kernels/test_utils.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" @@ -45,32 +42,33 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int 
hidden_dim = 32_p; positive_int output_dim = 1_p; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = - allocator.allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - }, - /*gradient_mapping=*/{}, - /*optimizer_mapping*/ {}, - }; + GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( + output_tensor_shape, allocator); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; + GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( + weight_shape_1, allocator); + GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( + weight_shape_2, allocator); + LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); + GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( + input_tensor_shape, allocator); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -79,6 +77,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -87,13 +86,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -104,7 +104,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, @@ -129,6 +129,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + AllocatedTensors allocated_tensors = AllocatedTensors{ + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, + {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, + {TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, + }; + LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, @@ -162,8 +173,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch + std::cout << "Final loss values" << std::endl; GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + std::cout << format_accessor_r_contents(first_epoch_loss) << std::endl; + GenericTensorAccessorR 
last_epoch = loss_values.back(); + std::cout << format_accessor_r_contents(last_epoch) << std::endl; + CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } } diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index 5e56ccdc1b..b56931b3f3 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -1,5 +1,6 @@ #include "task-spec/ops/linear.h" #include "kernels/linear_kernels.h" +#include "kernels/format_accessor_contents.h" #include "op-attrs/ff_dim_t.h" #include "task-spec/task_argument_accessor.h" #include "utils/exception.h" @@ -90,6 +91,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); + std::cout << "Input tensor" << std::endl; + std::cout << format_accessor_r_contents(input) << std::endl; + + std::cout << "Weight tensor" << std::endl; + std::cout << format_accessor_r_contents(weight) << std::endl; + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -105,7 +112,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { bias_ptr = bias.get_float_ptr(); } - return profile(forward_kernel, + auto result = profile(forward_kernel, profiling, "[Linear] forward_time = {:.2lf}ms\n", per_device_state, @@ -116,6 +123,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); + + std::cout << "Output tensor" << std::endl; + std::cout << format_accessor_w_contents(output) << std::endl; + + return result; } static std::optional @@ -128,6 +140,12 @@ static std::optional auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); + std::cout << "output grad tensor" << std::endl; + std::cout << format_accessor_w_contents(output_grad) << std::endl; + + std::cout << "weight grad tensor" << std::endl; + std::cout << format_accessor_w_contents(weight_grad) << std::endl; + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -143,7 +161,7 @@ static std::optional positive_int out_dim = output.shape.at(ff_dim_t{0_n}); positive_int batch_size = positive_int{output.shape.num_elements() / out_dim}; - return profile(backward_kernel, + auto result = profile(backward_kernel, profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, @@ -157,6 +175,13 @@ static std::optional in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); + std::cout << "output grad tensor after backward kernel" << std::endl; + std::cout << format_accessor_w_contents(output_grad) << std::endl; + + std::cout << "weight grad tensor after backward kernel" << std::endl; + std::cout << format_accessor_w_contents(weight_grad) << std::endl; + + return result; } TaskImplFunction get_linear_init_task_impl() { From dbbb57434600d14d1b74da7513a8eeeca98df594 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Jun 2025 08:47:32 -0700 Subject: [PATCH 83/91] Expose test utils --- lib/kernels/{test/src/internal => include/kernels}/test_utils.h | 0 lib/kernels/{test/src/internal => src}/test_utils.cc | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename lib/kernels/{test/src/internal => include/kernels}/test_utils.h (100%) rename lib/kernels/{test/src/internal => 
src}/test_utils.cc (99%) diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/include/kernels/test_utils.h similarity index 100% rename from lib/kernels/test/src/internal/test_utils.h rename to lib/kernels/include/kernels/test_utils.h diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/src/test_utils.cc similarity index 99% rename from lib/kernels/test/src/internal/test_utils.cc rename to lib/kernels/src/test_utils.cc index a9ba8dea13..67f2fb624a 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/src/test_utils.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/require_all_same1.h" #include "utils/join_strings.h" From a4c1ea4e1eddabec041c12b31092e8757c026be7 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Mon, 16 Jun 2025 22:25:37 -0700 Subject: [PATCH 84/91] Remove prints --- .../src/local_cost_estimator.cc | 2 -- lib/local-execution/src/loss_functions.cc | 2 -- .../src/model_training_instance.cc | 12 ----------- lib/local-execution/test/src/test_e2e.cc | 3 --- lib/task-spec/src/task-spec/ops/linear.cc | 20 ------------------- 5 files changed, 39 deletions(-) diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0a84c19066..85f315c7d1 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -95,10 +95,8 @@ CostDetails LocalCostEstimator::estimate_cost( float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed forward" << std::endl; float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 99225b1895..4d0b32fd48 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -126,8 +126,6 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { scale_factor); - std::cout << "Logit grad (loss) tensor after computation" << std::endl; - std::cout << format_accessor_w_contents(logit_grad) << std::endl; break; } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 847b1679ac..d3c1c65a68 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -35,16 +35,12 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->label_tensor, this->allocator); - std::cout << "Done computing loss" << std::endl; gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - - std::cout << "Loss (logit grad) tensor" << std::endl; - std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : reversed( @@ -70,12 +66,6 @@ GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { GenericTensorAccessorW logit_tensor_backing = this->training_backing 
.local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); - // for (auto const &pair : - // this->training_backing.local_tensor_backing.tensor_backings) { - // std::cout << "Tensor type: " << pair.first << std::endl; - // std::cout << "Tensor " << std::endl; - // std::cout << format_accessor_w_contents(pair.second) << std::endl; - // } gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( @@ -84,8 +74,6 @@ GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - std::cout << "Loss (logit grad) tensor" << std::endl; - std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; return read_only_accessor_from_write_accessor(loss_tensor_backing); } diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 80b2e6a398..de759e2e01 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -173,12 +173,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - std::cout << "Final loss values" << std::endl; GenericTensorAccessorR first_epoch_loss = loss_values.at(0); - std::cout << format_accessor_r_contents(first_epoch_loss) << std::endl; GenericTensorAccessorR last_epoch = loss_values.back(); - std::cout << format_accessor_r_contents(last_epoch) << std::endl; CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index b56931b3f3..e8be7781f5 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -91,12 +91,6 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - std::cout << "Input tensor" << std::endl; - std::cout << format_accessor_r_contents(input) << std::endl; - - std::cout << "Weight tensor" << std::endl; - std::cout << format_accessor_r_contents(weight) << std::endl; - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -123,9 +117,6 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); - - std::cout << "Output tensor" << std::endl; - std::cout << format_accessor_w_contents(output) << std::endl; return result; } @@ -140,12 +131,6 @@ static std::optional auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); - std::cout << "output grad tensor" << std::endl; - std::cout << format_accessor_w_contents(output_grad) << std::endl; - - std::cout << "weight grad tensor" << std::endl; - std::cout << format_accessor_w_contents(weight_grad) << std::endl; - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -175,11 +160,6 @@ static std::optional in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); - std::cout << "output grad tensor after backward kernel" << std::endl; - std::cout << format_accessor_w_contents(output_grad) << std::endl; - - std::cout << "weight grad tensor after backward kernel" << std::endl; - std::cout << 
format_accessor_w_contents(weight_grad) << std::endl; return result; } From 346f986e0db5f9f582311e6b06f8dccaa86fc943 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 11:37:24 -0700 Subject: [PATCH 85/91] tweak: minor --- .../allocated_tensors.struct.toml | 1 - .../fwd_bwd_op_task_impl_function.h | 32 -- .../generic_task_impl_function.h | 33 -- .../init_op_task_impl_function.h | 33 -- .../local-execution/itask_argument_accessor.h | 32 -- .../local-execution/local_cpu_allocator.h | 22 -- .../local_task_argument_accessor.h | 2 +- .../local_tensor_backing.struct.toml | 3 - .../include/local-execution/loss_functions.h | 2 +- .../local-execution/loss_tensor_source.h | 3 +- .../local-execution/model_training_instance.h | 2 +- .../include/local-execution/ops/attention.h | 26 -- .../local-execution/ops/batch_matmul.h | 24 -- .../include/local-execution/ops/batch_norm.h | 26 -- .../include/local-execution/ops/cast.h | 37 -- .../include/local-execution/ops/combine.h | 23 -- .../include/local-execution/ops/concat.h | 23 -- .../include/local-execution/ops/conv_2d.h | 26 -- .../include/local-execution/ops/dropout.h | 27 -- .../local-execution/ops/element_binary.h | 26 -- .../local-execution/ops/element_unary.h | 26 -- .../include/local-execution/ops/embedding.h | 23 -- .../include/local-execution/ops/flat.h | 23 -- .../include/local-execution/ops/gather.h | 26 -- .../include/local-execution/ops/input.h | 13 - .../include/local-execution/ops/layer_norm.h | 26 -- .../include/local-execution/ops/linear.h | 26 -- .../include/local-execution/ops/noop.h | 13 - .../include/local-execution/ops/parallel_op.h | 40 -- .../include/local-execution/ops/pool_2d.h | 26 -- .../include/local-execution/ops/reduce.h | 26 -- .../include/local-execution/ops/reduction.h | 24 -- .../include/local-execution/ops/repartition.h | 26 -- .../include/local-execution/ops/replicate.h | 23 -- .../include/local-execution/ops/reshape.h | 26 -- .../include/local-execution/ops/reverse.h | 23 -- .../include/local-execution/ops/softmax.h | 26 -- .../include/local-execution/ops/split.h | 23 -- .../include/local-execution/ops/topk.h | 26 -- .../include/local-execution/ops/transpose.h | 23 -- .../include/local-execution/ops/weight.h | 13 - .../include/local-execution/optimizer.h | 2 +- .../include/local-execution/permissions.h | 54 --- .../privilege_tensor_accessor.h | 39 -- .../local-execution/task_argument_accessor.h | 153 -------- .../task_impl_function.variant.toml | 26 -- .../local-execution/task_registry.struct.toml | 2 +- .../local-execution/task_signature_impl.h | 20 - .../task_signature_impl.struct.toml | 20 - .../local-execution/tracked_allocator.h | 3 + lib/local-execution/src/allocated_tensors.cc | 11 +- .../src/fwd_bwd_op_task_impl_function.cc | 54 --- .../src/generic_task_impl_function.cc | 53 --- .../src/init_op_task_impl_function.cc | 53 --- .../src/local-execution/ops/attention.cc | 259 ------------- .../src/local-execution/ops/batch_matmul.cc | 194 ---------- .../src/local-execution/ops/batch_norm.cc | 196 ---------- .../src/local-execution/ops/cast.cc | 110 ------ .../src/local-execution/ops/combine.cc | 94 ----- .../src/local-execution/ops/concat.cc | 107 ----- .../src/local-execution/ops/conv_2d.cc | 184 --------- .../src/local-execution/ops/dropout.cc | 134 ------- .../src/local-execution/ops/element_binary.cc | 180 --------- .../src/local-execution/ops/element_unary.cc | 165 -------- .../src/local-execution/ops/flat.cc | 81 ---- .../src/local-execution/ops/gather.cc | 174 --------- 
.../src/local-execution/ops/input.cc | 9 - .../src/local-execution/ops/layer_norm.cc | 190 --------- .../src/local-execution/ops/linear.cc | 210 ---------- .../src/local-execution/ops/noop.cc | 24 -- .../src/local-execution/ops/pool_2d.cc | 176 --------- .../src/local-execution/ops/reduce.cc | 148 ------- .../src/local-execution/ops/reduction.cc | 101 ----- .../src/local-execution/ops/repartition.cc | 137 ------- .../src/local-execution/ops/replicate.cc | 99 ----- .../src/local-execution/ops/reshape.cc | 132 ------- .../src/local-execution/ops/reverse.cc | 135 ------- .../src/local-execution/ops/softmax.cc | 153 -------- .../src/local-execution/ops/split.cc | 140 ------- .../src/local-execution/ops/topk.cc | 162 -------- .../src/local-execution/ops/transpose.cc | 107 ----- .../src/local-execution/ops/weight.cc | 9 - .../src/local_cpu_allocator.cc | 24 -- .../src/local_task_argument_accessor.cc | 7 +- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/loss_functions.cc | 70 ++-- lib/local-execution/src/loss_tensor_source.cc | 2 +- .../src/model_training_instance.cc | 21 +- lib/local-execution/src/optimizer.cc | 26 +- .../src/per_device_op_state.cc | 0 lib/local-execution/src/permissions.cc | 72 ---- lib/local-execution/src/task_registry.cc | 2 +- .../src/task_signature_impl.cc | 366 ------------------ lib/local-execution/src/tracked_allocator.cc | 7 +- .../src/unallocated_tensors.cc | 1 - lib/local-execution/test/CMakeLists.txt | 7 +- .../test/modify_test_commands.cmake | 21 - .../test/src/test_allocated_tensors.cc | 11 +- lib/local-execution/test/src/test_e2e.cc | 104 ++--- .../test/src/test_local_cost_estimator.cc | 21 +- .../test/src/test_local_task_arg_accessor.cc | 17 +- .../test/src/test_local_tensor_backing.cc | 8 +- .../test/src/test_loss_functions.cc | 24 +- .../test/src/test_task_registry.cc | 10 +- .../test/src/test_unallocated_tensors.cc | 11 +- lib/local-execution/test/src/test_update.cc | 18 +- 106 files changed, 209 insertions(+), 5847 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/generic_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/init_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/itask_argument_accessor.h delete mode 100644 lib/local-execution/include/local-execution/local_cpu_allocator.h delete mode 100644 lib/local-execution/include/local-execution/ops/attention.h delete mode 100644 lib/local-execution/include/local-execution/ops/batch_matmul.h delete mode 100644 lib/local-execution/include/local-execution/ops/batch_norm.h delete mode 100644 lib/local-execution/include/local-execution/ops/cast.h delete mode 100644 lib/local-execution/include/local-execution/ops/combine.h delete mode 100644 lib/local-execution/include/local-execution/ops/concat.h delete mode 100644 lib/local-execution/include/local-execution/ops/conv_2d.h delete mode 100644 lib/local-execution/include/local-execution/ops/dropout.h delete mode 100644 lib/local-execution/include/local-execution/ops/element_binary.h delete mode 100644 lib/local-execution/include/local-execution/ops/element_unary.h delete mode 100644 lib/local-execution/include/local-execution/ops/embedding.h delete mode 100644 lib/local-execution/include/local-execution/ops/flat.h delete mode 100644 lib/local-execution/include/local-execution/ops/gather.h delete mode 100644 
lib/local-execution/include/local-execution/ops/input.h delete mode 100644 lib/local-execution/include/local-execution/ops/layer_norm.h delete mode 100644 lib/local-execution/include/local-execution/ops/linear.h delete mode 100644 lib/local-execution/include/local-execution/ops/noop.h delete mode 100644 lib/local-execution/include/local-execution/ops/parallel_op.h delete mode 100644 lib/local-execution/include/local-execution/ops/pool_2d.h delete mode 100644 lib/local-execution/include/local-execution/ops/reduce.h delete mode 100644 lib/local-execution/include/local-execution/ops/reduction.h delete mode 100644 lib/local-execution/include/local-execution/ops/repartition.h delete mode 100644 lib/local-execution/include/local-execution/ops/replicate.h delete mode 100644 lib/local-execution/include/local-execution/ops/reshape.h delete mode 100644 lib/local-execution/include/local-execution/ops/reverse.h delete mode 100644 lib/local-execution/include/local-execution/ops/softmax.h delete mode 100644 lib/local-execution/include/local-execution/ops/split.h delete mode 100644 lib/local-execution/include/local-execution/ops/topk.h delete mode 100644 lib/local-execution/include/local-execution/ops/transpose.h delete mode 100644 lib/local-execution/include/local-execution/ops/weight.h delete mode 100644 lib/local-execution/include/local-execution/permissions.h delete mode 100644 lib/local-execution/include/local-execution/privilege_tensor_accessor.h delete mode 100644 lib/local-execution/include/local-execution/task_argument_accessor.h delete mode 100644 lib/local-execution/include/local-execution/task_impl_function.variant.toml delete mode 100644 lib/local-execution/include/local-execution/task_signature_impl.h delete mode 100644 lib/local-execution/include/local-execution/task_signature_impl.struct.toml delete mode 100644 lib/local-execution/src/fwd_bwd_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/generic_task_impl_function.cc delete mode 100644 lib/local-execution/src/init_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/local-execution/ops/attention.cc delete mode 100644 lib/local-execution/src/local-execution/ops/batch_matmul.cc delete mode 100644 lib/local-execution/src/local-execution/ops/batch_norm.cc delete mode 100644 lib/local-execution/src/local-execution/ops/cast.cc delete mode 100644 lib/local-execution/src/local-execution/ops/combine.cc delete mode 100644 lib/local-execution/src/local-execution/ops/concat.cc delete mode 100644 lib/local-execution/src/local-execution/ops/conv_2d.cc delete mode 100644 lib/local-execution/src/local-execution/ops/dropout.cc delete mode 100644 lib/local-execution/src/local-execution/ops/element_binary.cc delete mode 100644 lib/local-execution/src/local-execution/ops/element_unary.cc delete mode 100644 lib/local-execution/src/local-execution/ops/flat.cc delete mode 100644 lib/local-execution/src/local-execution/ops/gather.cc delete mode 100644 lib/local-execution/src/local-execution/ops/input.cc delete mode 100644 lib/local-execution/src/local-execution/ops/layer_norm.cc delete mode 100644 lib/local-execution/src/local-execution/ops/linear.cc delete mode 100644 lib/local-execution/src/local-execution/ops/noop.cc delete mode 100644 lib/local-execution/src/local-execution/ops/pool_2d.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reduce.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reduction.cc delete mode 100644 lib/local-execution/src/local-execution/ops/repartition.cc delete 
mode 100644 lib/local-execution/src/local-execution/ops/replicate.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reshape.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reverse.cc delete mode 100644 lib/local-execution/src/local-execution/ops/softmax.cc delete mode 100644 lib/local-execution/src/local-execution/ops/split.cc delete mode 100644 lib/local-execution/src/local-execution/ops/topk.cc delete mode 100644 lib/local-execution/src/local-execution/ops/transpose.cc delete mode 100644 lib/local-execution/src/local-execution/ops/weight.cc delete mode 100644 lib/local-execution/src/local_cpu_allocator.cc rename lib/{task-spec => local-execution}/src/per_device_op_state.cc (100%) delete mode 100644 lib/local-execution/src/permissions.cc delete mode 100644 lib/local-execution/src/task_signature_impl.cc delete mode 100644 lib/local-execution/test/modify_test_commands.cmake diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml index 09245097b4..33985b0d74 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -3,7 +3,6 @@ name = "AllocatedTensors" features = [ "eq", "fmt", - "hash", ] includes = [ diff --git a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h deleted file mode 100644 index cc82291f6a..0000000000 --- a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" - -namespace FlexFlow { - -struct FwdBwdOpTaskImplFunction { - - std::optional (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(FwdBwdOpTaskImplFunction const &) const; - bool operator!=(FwdBwdOpTaskImplFunction const &) const; - bool operator<(FwdBwdOpTaskImplFunction const &) const; - bool operator>(FwdBwdOpTaskImplFunction const &) const; - bool operator<=(FwdBwdOpTaskImplFunction const &) const; - bool operator>=(FwdBwdOpTaskImplFunction const &) const; -}; - -std::string format_as(FwdBwdOpTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::FwdBwdOpTaskImplFunction> { - size_t operator()(::FlexFlow::FwdBwdOpTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/local-execution/include/local-execution/generic_task_impl_function.h deleted file mode 100644 index 9ce22ecf54..0000000000 --- a/lib/local-execution/include/local-execution/generic_task_impl_function.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" -#include "task-spec/device_specific_device_states.dtg.h" - -namespace FlexFlow { - -struct GenericTaskImplFunction { - - void (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(GenericTaskImplFunction const &) const; - bool operator!=(GenericTaskImplFunction const &) const; - bool 
operator<(GenericTaskImplFunction const &) const; - bool operator>(GenericTaskImplFunction const &) const; - bool operator<=(GenericTaskImplFunction const &) const; - bool operator>=(GenericTaskImplFunction const &) const; -}; - -std::string format_as(GenericTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::GenericTaskImplFunction> { - size_t operator()(::FlexFlow::GenericTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/local-execution/include/local-execution/init_op_task_impl_function.h deleted file mode 100644 index 0481e31a5f..0000000000 --- a/lib/local-execution/include/local-execution/init_op_task_impl_function.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" -#include "task-spec/device_specific_device_states.dtg.h" - -namespace FlexFlow { - -struct InitOpTaskImplFunction { - - DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(InitOpTaskImplFunction const &) const; - bool operator!=(InitOpTaskImplFunction const &) const; - bool operator<(InitOpTaskImplFunction const &) const; - bool operator>(InitOpTaskImplFunction const &) const; - bool operator<=(InitOpTaskImplFunction const &) const; - bool operator>=(InitOpTaskImplFunction const &) const; -}; - -std::string format_as(InitOpTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::InitOpTaskImplFunction> { - size_t operator()(::FlexFlow::InitOpTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/local-execution/include/local-execution/itask_argument_accessor.h deleted file mode 100644 index 24b3b3a37f..0000000000 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H - -#include "kernels/allocation.h" -#include "local-execution/privilege_tensor_accessor.h" -#include "task-spec/concrete_arg.h" -#include "task-spec/op_task_signature.h" -#include "task-spec/tensor_type.dtg.h" - -namespace FlexFlow { - -struct ITaskArgumentAccessor { - ITaskArgumentAccessor &operator=(ITaskArgumentAccessor const &) = delete; - - virtual ~ITaskArgumentAccessor() = default; - - virtual ConcreteArgSpec const &get_concrete_arg(slot_id_t) const = 0; - - virtual GenericTensorAccessor get_tensor(slot_id_t slot, - Permissions priv, - TensorType tensor_type) const = 0; - virtual VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, TensorType tensor_type) const = 0; - - virtual Allocator get_allocator() const = 0; - virtual size_t get_device_idx() const = 0; -}; -CHECK_RC_COPY_VIRTUAL_COMPLIANT(ITaskArgumentAccessor); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h deleted file mode 100644 index d1e81facf2..0000000000 --- 
a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ /dev/null @@ -1,22 +0,0 @@ -#include "kernels/allocation.h" -#include - -namespace FlexFlow { - -struct LocalCPUAllocator : public IAllocator { - LocalCPUAllocator() = default; - LocalCPUAllocator(LocalCPUAllocator const &) = delete; - LocalCPUAllocator(LocalCPUAllocator &&) = delete; - ~LocalCPUAllocator() = default; - - void *allocate(size_t) override; - void deallocate(void *) override; - -private: - std::unordered_map> ptrs; -}; -CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); - -Allocator create_local_cpu_memory_allocator(); - -} // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index c46534330b..184bf0b559 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/task_argument_accessor.h" #include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_argument_accessor.h" #include #include diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml index c34063af5d..bd59ec325d 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -3,7 +3,6 @@ name = "LocalTensorBacking" features = [ "eq", "fmt", - "hash" ] includes = [ @@ -15,9 +14,7 @@ includes = [ ] src_includes = [ - "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", - "utils/hash/vector.h", "utils/fmt/vector.h", ] diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index c06908503a..c75d4414de 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,10 +16,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" #include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h index d9858cde40..b794207c7f 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H #include "task-spec/loss_tensor_t.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -12,7 +13,7 @@ struct LossTensorSource { loss_tensor_t new_loss_tensor(); private: - static size_t next_available_loss_tensor_id; + static nonnegative_int next_available_loss_tensor_id; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h 
b/lib/local-execution/include/local-execution/model_training_instance.h index 2deed6b0a2..6f8f4b1543 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - void write_loss_tensor_to_host(float *host_ptr); + GenericTensorAccessorR get_loss_tensor_accessor() const; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/local-execution/include/local-execution/ops/attention.h deleted file mode 100644 index bf5385f609..0000000000 --- a/lib/local-execution/include/local-execution/ops/attention.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_ATTENTION_H -#define _FLEXFLOW_ATTENTION_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/attention.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(MultiHeadAttentionAttrs const &); - -TaskImplFunction get_attention_init_task_impl(); -TaskImplFunction get_attention_fwd_task_impl(); -TaskImplFunction get_attention_bwd_task_impl(); - -OpTaskSignature get_attention_init_signature(); -OpTaskSignature get_attention_fwd_signature(); -OpTaskSignature get_attention_bwd_signature(); - -OpTaskInvocation init(MultiHeadAttentionAttrs const &); -OpTaskInvocation forward(MultiHeadAttentionAttrs const &); -OpTaskInvocation backward(MultiHeadAttentionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/batch_matmul.h b/lib/local-execution/include/local-execution/ops/batch_matmul.h deleted file mode 100644 index 64d220ab66..0000000000 --- a/lib/local-execution/include/local-execution/ops/batch_matmul.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_BATCH_MATMUL_H -#define _FLEXFLOW_BATCH_MATMUL_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/batch_matmul_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/op_task_signature.h" - -namespace FlexFlow { - -std::vector get_task_ids(BatchMatmulAttrs const &); - -TaskImplFunction get_batch_matmul_fwd_task_impl(); -TaskImplFunction get_batch_matmul_bwd_task_impl(); - -OpTaskSignature get_batch_matmul_fwd_signature(); -OpTaskSignature get_batch_matmul_bwd_signature(); - -OpTaskInvocation forward(BatchMatmulAttrs const &); -OpTaskInvocation backward(BatchMatmulAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/batch_norm.h b/lib/local-execution/include/local-execution/ops/batch_norm.h deleted file mode 100644 index 85a7190ce1..0000000000 --- a/lib/local-execution/include/local-execution/ops/batch_norm.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_BATCH_NORM_H -#define _FLEXFLOW_BATCH_NORM_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/batch_norm_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(BatchNormAttrs const &); - -TaskImplFunction get_batch_norm_init_task_impl(); -TaskImplFunction get_batch_norm_fwd_task_impl(); -TaskImplFunction get_batch_norm_bwd_task_impl(); - -OpTaskSignature get_batch_norm_init_signature(); -OpTaskSignature get_batch_norm_fwd_signature(); -OpTaskSignature get_batch_norm_bwd_signature(); - -OpTaskInvocation init(BatchNormAttrs const &); -OpTaskInvocation 
forward(BatchNormAttrs const &); -OpTaskInvocation backward(BatchNormAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/cast.h b/lib/local-execution/include/local-execution/ops/cast.h deleted file mode 100644 index 6a27ad267a..0000000000 --- a/lib/local-execution/include/local-execution/ops/cast.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef _FLEXFLOW_CAST_H -#define _FLEXFLOW_CAST_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/cast_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(CastAttrs const &); - -TaskImplFunction get_cast_fwd_task_impl(); -TaskImplFunction get_cast_bwd_task_impl(); - -OpTaskSignature get_cast_fwd_signature(); -OpTaskSignature get_cast_bwd_signature(); - -OpTaskInvocation forward(CastAttrs const &); -OpTaskInvocation backward(CastAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/combine.h b/lib/local-execution/include/local-execution/ops/combine.h deleted file mode 100644 index 00e9cbed2c..0000000000 --- a/lib/local-execution/include/local-execution/ops/combine.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_COMBINE_H -#define _FLEXFLOW_COMBINE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/combine_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(CombineAttrs const &); - -TaskImplFunction get_combine_fwd_task_impl(); -TaskImplFunction get_combine_bwd_task_impl(); - -OpTaskSignature get_combine_fwd_signature(); -OpTaskSignature get_combine_bwd_signature(); - -OpTaskInvocation forward(CombineAttrs const &); -OpTaskInvocation backward(CombineAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/concat.h b/lib/local-execution/include/local-execution/ops/concat.h deleted file mode 100644 index c46164e417..0000000000 --- a/lib/local-execution/include/local-execution/ops/concat.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_CONCAT_H -#define _FLEXFLOW_CONCAT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/concat_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ConcatAttrs const &); - -TaskImplFunction get_concat_fwd_task_impl(); -TaskImplFunction get_concat_bwd_task_impl(); - -OpTaskSignature get_concat_fwd_signature(); -OpTaskSignature get_concat_bwd_signature(); - -OpTaskInvocation forward(ConcatAttrs const &); -OpTaskInvocation backward(ConcatAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/conv_2d.h b/lib/local-execution/include/local-execution/ops/conv_2d.h deleted file mode 100644 index 
f3bb34ffeb..0000000000 --- a/lib/local-execution/include/local-execution/ops/conv_2d.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_CONV_2D_H -#define _FLEXFLOW_CONV_2D_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/conv_2d_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(Conv2DAttrs const &); - -TaskImplFunction get_conv_2d_init_task_impl(); -TaskImplFunction get_conv_2d_fwd_task_impl(); -TaskImplFunction get_conv_2d_bwd_task_impl(); - -OpTaskSignature get_conv_2d_init_signature(); -OpTaskSignature get_conv_2d_fwd_signature(); -OpTaskSignature get_conv_2d_bwd_signature(); - -OpTaskInvocation init(Conv2DAttrs const &); -OpTaskInvocation forward(Conv2DAttrs const &); -OpTaskInvocation backward(Conv2DAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/dropout.h b/lib/local-execution/include/local-execution/ops/dropout.h deleted file mode 100644 index bd7b426c6b..0000000000 --- a/lib/local-execution/include/local-execution/ops/dropout.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _FLEXFLOW_DROPOUT_H -#define _FLEXFLOW_DROPOUT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/dropout_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_id_t.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(DropoutAttrs const &); - -TaskImplFunction get_dropout_init_task_impl(); -TaskImplFunction get_dropout_fwd_task_impl(); -TaskImplFunction get_dropout_bwd_task_impl(); - -OpTaskSignature get_dropout_init_signature(); -OpTaskSignature get_dropout_fwd_signature(); -OpTaskSignature get_dropout_bwd_signature(); - -OpTaskInvocation init(DropoutAttrs const &); -OpTaskInvocation forward(DropoutAttrs const &); -OpTaskInvocation backward(DropoutAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/element_binary.h b/lib/local-execution/include/local-execution/ops/element_binary.h deleted file mode 100644 index 4e0bb46e47..0000000000 --- a/lib/local-execution/include/local-execution/ops/element_binary.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_ELEMENT_BINARY_H -#define _FLEXFLOW_ELEMENT_BINARY_H - -#include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_signature_impl.h" -#include "op-attrs/ops/element_binary_attrs.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(ElementBinaryAttrs const &); - -OpTaskInvocation init(ElementBinaryAttrs const &); -OpTaskInvocation forward(ElementBinaryAttrs const &); -OpTaskInvocation backward(ElementBinaryAttrs const &); - -TaskImplFunction get_element_binary_init_task_impl(); -TaskImplFunction get_element_binary_fwd_task_impl(); -TaskImplFunction get_element_binary_bwd_task_impl(); - -OpTaskSignature get_element_binary_init_signature(); -OpTaskSignature get_element_binary_fwd_signature(); -OpTaskSignature get_element_binary_bwd_signature(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/element_unary.h b/lib/local-execution/include/local-execution/ops/element_unary.h deleted file mode 100644 index 9900668d6c..0000000000 --- a/lib/local-execution/include/local-execution/ops/element_unary.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _ELEMENT_UNARY_H -#define _ELEMENT_UNARY_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/element_unary_attrs.dtg.h" -#include 
"task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ElementUnaryAttrs const &); - -TaskImplFunction get_element_unary_init_task_impl(); -TaskImplFunction get_element_unary_fwd_task_impl(); -TaskImplFunction get_element_unary_bwd_task_impl(); - -OpTaskSignature get_element_unary_init_signature(); -OpTaskSignature get_element_unary_fwd_signature(); -OpTaskSignature get_element_unary_bwd_signature(); - -OpTaskInvocation init(ElementUnaryAttrs const &); -OpTaskInvocation forward(ElementUnaryAttrs const &); -OpTaskInvocation backward(ElementUnaryAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/embedding.h b/lib/local-execution/include/local-execution/ops/embedding.h deleted file mode 100644 index b998aef53e..0000000000 --- a/lib/local-execution/include/local-execution/ops/embedding.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_EMBEDDING_H -#define _FLEXFLOW_EMBEDDING_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/embedding_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(EmbeddingAttrs const &); - -TaskImplFunction get_embedding_fwd_task_impl(); -TaskImplFunction get_embedding_bwd_task_impl(); - -OpTaskSignature get_embedding_fwd_signature(); -OpTaskSignature get_embedding_bwd_signature(); - -OpTaskInvocation forward(EmbeddingAttrs const &); -OpTaskInvocation backward(EmbeddingAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/flat.h b/lib/local-execution/include/local-execution/ops/flat.h deleted file mode 100644 index 95afb98340..0000000000 --- a/lib/local-execution/include/local-execution/ops/flat.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_FLAT_H -#define _FLEXFLOW_FLAT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/flat_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(FlatAttrs const &); - -TaskImplFunction get_flat_fwd_task_impl(); -TaskImplFunction get_flat_bwd_task_impl(); - -OpTaskSignature get_flat_fwd_signature(); -OpTaskSignature get_flat_bwd_signature(); - -OpTaskInvocation forward(FlatAttrs const &); -OpTaskInvocation backward(FlatAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/gather.h b/lib/local-execution/include/local-execution/ops/gather.h deleted file mode 100644 index 5569a94728..0000000000 --- a/lib/local-execution/include/local-execution/ops/gather.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_GATHER_H -#define _FLEXFLOW_GATHER_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/gather_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(GatherAttrs const &); - -TaskImplFunction get_gather_init_task_impl(); -TaskImplFunction get_gather_fwd_task_impl(); -TaskImplFunction get_gather_bwd_task_impl(); - -OpTaskSignature get_gather_init_signature(); -OpTaskSignature get_gather_fwd_signature(); -OpTaskSignature get_gather_bwd_signature(); - -OpTaskInvocation init(GatherAttrs const &); -OpTaskInvocation forward(GatherAttrs const &); -OpTaskInvocation backward(GatherAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/input.h b/lib/local-execution/include/local-execution/ops/input.h deleted file mode 100644 index 
9181478363..0000000000 --- a/lib/local-execution/include/local-execution/ops/input.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_INPUT_H -#define _FLEXFLOW_INPUT_H - -#include "op-attrs/ops/input_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(InputAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/layer_norm.h b/lib/local-execution/include/local-execution/ops/layer_norm.h deleted file mode 100644 index e4a15caac2..0000000000 --- a/lib/local-execution/include/local-execution/ops/layer_norm.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/layer_norm_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(LayerNormAttrs const &); - -TaskImplFunction get_layer_norm_init_task_impl(); -TaskImplFunction get_layer_norm_fwd_task_impl(); -TaskImplFunction get_layer_norm_bwd_task_impl(); - -OpTaskSignature get_layer_norm_init_signature(); -OpTaskSignature get_layer_norm_fwd_signature(); -OpTaskSignature get_layer_norm_bwd_signature(); - -OpTaskInvocation init(LayerNormAttrs const &); -OpTaskInvocation forward(LayerNormAttrs const &); -OpTaskInvocation backward(LayerNormAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/linear.h b/lib/local-execution/include/local-execution/ops/linear.h deleted file mode 100644 index d58d876865..0000000000 --- a/lib/local-execution/include/local-execution/ops/linear.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_LINEAR_H -#define _FLEXFLOW_LINEAR_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/linear_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(LinearAttrs const &); - -OpTaskInvocation init(LinearAttrs const &); -OpTaskInvocation forward(LinearAttrs const &); -OpTaskInvocation backward(LinearAttrs const &); - -TaskImplFunction get_linear_init_task_impl(); -TaskImplFunction get_linear_fwd_task_impl(); -TaskImplFunction get_linear_bwd_task_impl(); - -OpTaskSignature get_linear_init_signature(); -OpTaskSignature get_linear_fwd_signature(); -OpTaskSignature get_linear_bwd_signature(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/noop.h b/lib/local-execution/include/local-execution/ops/noop.h deleted file mode 100644 index adbc15cd3b..0000000000 --- a/lib/local-execution/include/local-execution/ops/noop.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_NOOP_H -#define _FLEXFLOW_NOOP_H - -#include "op-attrs/ops/noop_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(NoopAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/parallel_op.h b/lib/local-execution/include/local-execution/ops/parallel_op.h deleted file mode 100644 index e7bd98b8a8..0000000000 --- a/lib/local-execution/include/local-execution/ops/parallel_op.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef _FLEXFLOW_PARALLEL_OP_H -#define _FLEXFLOW_PARALLEL_OP_H - -#include "parallel_op_info.h" -#include "utils/optional.h" - -namespace FlexFlow { - -struct ParallelOpJoinResult { - std::optional op = std::nullopt; - bool join_did_succeed = false; -}; - 
-ParallelOpJoinResult try_join_parallel_ops(ParallelOpInfo const &, - ParallelOpInfo const &); - -/* class ParallelOp : public Op { */ -/* public: */ -/* ParallelOp(FFModel &model, */ -/* OperatorType type, */ -/* char const *_name, */ -/* const ParallelTensor input); */ -/* virtual void init(FFModel const &) = 0; */ -/* virtual void forward(FFModel const &) = 0; */ -/* virtual void backward(FFModel const &) = 0; */ -/* virtual void create_input_partition(FFModel &model) = 0; */ -/* virtual bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const = 0; */ -/* virtual bool append_parallel_op_info( */ -/* std::vector ¶llel_ops) const = 0; */ -/* virtual bool is_parallel_op() const; */ - -/* public: */ -/* Legion::LogicalPartition input_lp, output_grad_lp; */ -/* }; */ - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/pool_2d.h b/lib/local-execution/include/local-execution/ops/pool_2d.h deleted file mode 100644 index 7d0ec44bd7..0000000000 --- a/lib/local-execution/include/local-execution/ops/pool_2d.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_POOL_2D_H -#define _FLEXFLOW_POOL_2D_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/pool_2d_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(Pool2DAttrs const &); - -TaskImplFunction get_pool_2d_init_task_impl(); -TaskImplFunction get_pool_2d_fwd_task_impl(); -TaskImplFunction get_pool_2d_bwd_task_impl(); - -OpTaskSignature get_pool_2d_init_signature(); -OpTaskSignature get_pool_2d_fwd_signature(); -OpTaskSignature get_pool_2d_bwd_signature(); - -OpTaskInvocation init(Pool2DAttrs const &); -OpTaskInvocation forward(Pool2DAttrs const &); -OpTaskInvocation backward(Pool2DAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reduce.h b/lib/local-execution/include/local-execution/ops/reduce.h deleted file mode 100644 index 5c6d4be338..0000000000 --- a/lib/local-execution/include/local-execution/ops/reduce.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reduce_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReduceAttrs const &); - -TaskImplFunction get_reduce_init_task_impl(); -TaskImplFunction get_reduce_fwd_task_impl(); -TaskImplFunction get_reduce_bwd_task_impl(); - -OpTaskSignature get_reduce_init_signature(); -OpTaskSignature get_reduce_fwd_signature(); -OpTaskSignature get_reduce_bwd_signature(); - -OpTaskInvocation init(ReduceAttrs const &); -OpTaskInvocation forward(ReduceAttrs const &); -OpTaskInvocation backward(ReduceAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reduction.h b/lib/local-execution/include/local-execution/ops/reduction.h deleted file mode 100644 index 7475d3aeb4..0000000000 --- a/lib/local-execution/include/local-execution/ops/reduction.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_REDUCTION_H -#define _FLEXFLOW_REDUCTION_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reduction_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReductionAttrs const &); - -TaskImplFunction get_reduction_fwd_task_impl(); 
-TaskImplFunction get_reduction_bwd_task_impl(); - -OpTaskSignature get_reduction_fwd_signature(); -OpTaskSignature get_reduction_bwd_signature(); - -OpTaskInvocation init(ReductionAttrs const &); -OpTaskInvocation forward(ReductionAttrs const &); -OpTaskInvocation backward(ReductionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/repartition.h b/lib/local-execution/include/local-execution/ops/repartition.h deleted file mode 100644 index 08ecdafcf2..0000000000 --- a/lib/local-execution/include/local-execution/ops/repartition.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_PARTITION_H -#define _FLEXFLOW_PARTITION_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/repartition_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(RepartitionAttrs const &); - -TaskImplFunction get_repartition_init_task_impl(); -TaskImplFunction get_repartition_fwd_task_impl(); -TaskImplFunction get_repartition_bwd_task_impl(); - -OpTaskSignature get_repartition_init_signature(); -OpTaskSignature get_repartition_fwd_signature(); -OpTaskSignature get_repartition_bwd_signature(); - -OpTaskInvocation init(RepartitionAttrs const &); -OpTaskInvocation forward(RepartitionAttrs const &); -OpTaskInvocation backward(RepartitionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/replicate.h b/lib/local-execution/include/local-execution/ops/replicate.h deleted file mode 100644 index b827b9c272..0000000000 --- a/lib/local-execution/include/local-execution/ops/replicate.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_REPLICATE_H -#define _FLEXFLOW_REPLICATE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/replicate_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReplicateAttrs const &); - -TaskImplFunction get_replicate_fwd_task_impl(); -TaskImplFunction get_replicate_bwd_task_impl(); - -OpTaskSignature get_replicate_fwd_signature(); -OpTaskSignature get_replicate_bwd_signature(); - -OpTaskInvocation forward(ReplicateAttrs const &); -OpTaskInvocation backward(ReplicateAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reshape.h b/lib/local-execution/include/local-execution/ops/reshape.h deleted file mode 100644 index ed7e6e9e31..0000000000 --- a/lib/local-execution/include/local-execution/ops/reshape.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_RESHAPE_H -#define _FLEXFLOW_RESHAPE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reshape_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReshapeAttrs const &); - -TaskImplFunction get_reshape_init_task_impl(); -TaskImplFunction get_reshape_fwd_task_impl(); -TaskImplFunction get_reshape_bwd_task_impl(); - -OpTaskSignature get_reshape_init_signature(); -OpTaskSignature get_reshape_fwd_signature(); -OpTaskSignature get_reshape_bwd_signature(); - -OpTaskInvocation init(ReshapeAttrs const &); -OpTaskInvocation forward(ReshapeAttrs const &); -OpTaskInvocation backward(ReshapeAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reverse.h b/lib/local-execution/include/local-execution/ops/reverse.h deleted file mode 100644 index dd0e89ecad..0000000000 --- 
a/lib/local-execution/include/local-execution/ops/reverse.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_REVERSE_H_ -#define _FLEXFLOW_REVERSE_H_ - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reverse_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReverseAttrs const &); - -TaskImplFunction get_reverse_fwd_task_impl(); -TaskImplFunction get_reverse_bwd_task_impl(); - -OpTaskSignature get_reverse_fwd_signature(); -OpTaskSignature get_reverse_bwd_signature(); - -OpTaskInvocation forward(ReverseAttrs const &); -OpTaskInvocation backward(ReverseAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/softmax.h b/lib/local-execution/include/local-execution/ops/softmax.h deleted file mode 100644 index 294d948b42..0000000000 --- a/lib/local-execution/include/local-execution/ops/softmax.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_SOFTMAX_H -#define _FLEXFLOW_SOFTMAX_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/softmax_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(SoftmaxAttrs const &); - -TaskImplFunction get_softmax_init_task_impl(); -TaskImplFunction get_softmax_fwd_task_impl(); -TaskImplFunction get_softmax_bwd_task_impl(); - -OpTaskSignature get_softmax_init_signature(); -OpTaskSignature get_softmax_fwd_signature(); -OpTaskSignature get_softmax_bwd_signature(); - -OpTaskInvocation init(SoftmaxAttrs const &); -OpTaskInvocation forward(SoftmaxAttrs const &); -OpTaskInvocation backward(SoftmaxAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/split.h b/lib/local-execution/include/local-execution/ops/split.h deleted file mode 100644 index 49cd7cfc7b..0000000000 --- a/lib/local-execution/include/local-execution/ops/split.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_SPLIT_H -#define _FLEXFLOW_SPLIT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/split_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(SplitAttrs const &); - -TaskImplFunction get_split_fwd_task_impl(); -TaskImplFunction get_split_bwd_task_impl(); - -OpTaskSignature get_split_fwd_signature(); -OpTaskSignature get_split_bwd_signature(); - -OpTaskInvocation forward(SplitAttrs const &); -OpTaskInvocation backward(SplitAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/topk.h b/lib/local-execution/include/local-execution/ops/topk.h deleted file mode 100644 index aeded512cd..0000000000 --- a/lib/local-execution/include/local-execution/ops/topk.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_TOPK_H_ -#define _FLEXFLOW_TOPK_H_ - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/topk_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(TopKAttrs const &); - -TaskImplFunction get_topk_init_task_impl(); -TaskImplFunction get_topk_fwd_task_impl(); -TaskImplFunction get_topk_bwd_task_impl(); - -OpTaskSignature get_topk_init_signature(); -OpTaskSignature get_topk_fwd_signature(); -OpTaskSignature get_topk_bwd_signature(); - -OpTaskInvocation init(TopKAttrs const &); -OpTaskInvocation forward(TopKAttrs const &); -OpTaskInvocation backward(TopKAttrs const &); - -} // namespace FlexFlow - 
-#endif diff --git a/lib/local-execution/include/local-execution/ops/transpose.h b/lib/local-execution/include/local-execution/ops/transpose.h deleted file mode 100644 index 2c7b5fb3bc..0000000000 --- a/lib/local-execution/include/local-execution/ops/transpose.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_TRANSPOSE_H_ -#define _FLEXFLOW_TRANSPOSE_H_ - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/transpose_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(TransposeAttrs const &); - -TaskImplFunction get_transpose_fwd_task_impl(); -TaskImplFunction get_transpose_bwd_task_impl(); - -OpTaskSignature get_transpose_fwd_signature(); -OpTaskSignature get_transpose_bwd_signature(); - -OpTaskInvocation forward(TransposeAttrs const &); -OpTaskInvocation backward(TransposeAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/weight.h b/lib/local-execution/include/local-execution/ops/weight.h deleted file mode 100644 index 162236e41e..0000000000 --- a/lib/local-execution/include/local-execution/ops/weight.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_WEIGHT_H -#define _FLEXFLOW_WEIGHT_H - -#include "op-attrs/ops/weight_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(WeightAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index f6bd5a3ee9..e4a9c78743 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include "local-execution/task_impl_function.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/include/local-execution/permissions.h b/lib/local-execution/include/local-execution/permissions.h deleted file mode 100644 index f34969f233..0000000000 --- a/lib/local-execution/include/local-execution/permissions.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H - -#include "utils/exception.h" -#include "utils/fmt.h" - -namespace FlexFlow { - -enum class Permissions { NONE, RO, WO, RW }; - -Permissions join(Permissions lhs, Permissions rhs); -Permissions meet(Permissions lhs, Permissions rhs); - -bool operator<(Permissions lhs, Permissions rhs); -bool operator<=(Permissions lhs, Permissions rhs); -bool operator>(Permissions lhs, Permissions rhs); -bool operator>=(Permissions lhs, Permissions rhs); - -} // namespace FlexFlow - -namespace fmt { - -template <> -struct formatter<::FlexFlow::Permissions> : formatter { - template - auto format(::FlexFlow::Permissions p, FormatContext &ctx) const - -> decltype(ctx.out()) { - using ::FlexFlow::Permissions; - - string_view name = "unknown"; - switch (p) { - case Permissions::NONE: - name = "NO_PERMISSIONS"; - break; - case Permissions::RO: - name = "READ_ONLY"; - break; - case Permissions::WO: - name = "WRITE_ONLY"; - break; - case Permissions::RW: - name = 
"READ_WRITE"; - break; - default: - throw ::FlexFlow::mk_runtime_error( - fmt::format("Unknown permission {}", static_cast(p))); - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - -#endif diff --git a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h b/lib/local-execution/include/local-execution/privilege_tensor_accessor.h deleted file mode 100644 index aeae3c2e41..0000000000 --- a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H - -#include "kernels/accessor.h" -#include "local-execution/permissions.h" - -namespace FlexFlow { - -template -struct privilege_mode_to_accessor_t {}; - -template <> -struct privilege_mode_to_accessor_t { - using type = GenericTensorAccessorW; -}; - -template <> -struct privilege_mode_to_accessor_t { - using type = GenericTensorAccessorR; -}; - -template <> -struct privilege_mode_to_accessor_t { - using type = GenericTensorAccessorW; -}; - -template -using privilege_mode_to_accessor = - typename privilege_mode_to_accessor_t::type; - -using GenericTensorAccessor = - std::variant; -using VariadicGenericTensorAccessor = - std::variant, - std::vector>; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h deleted file mode 100644 index 499b5ff7d6..0000000000 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H - -#include "local-execution/itask_argument_accessor.h" -#include "task-spec/device_specific.h" -#include "task-spec/per_device_op_state.dtg.h" - -namespace FlexFlow { - -struct TaskArgumentAccessor { - // arguments - template - T const &get_argument(slot_id_t slot) const { - if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { - PerDeviceOpState device_states = - this->ptr->get_concrete_arg(slot).get(); - if (device_states.has()) { - return device_states.get(); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); - } - } else { - return this->ptr->get_concrete_arg(slot).get(); - } - } - - template - T const &get_argument(int slot) const { - return this->get_argument(slot_id_t{slot}); - } - - // tensors - template - privilege_mode_to_accessor get_tensor(int slot) const { - return this->get_tensor(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::FORWARD)); - } - - template - privilege_mode_to_accessor get_tensor_grad(int slot) const { - return this->get_tensor_grad(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_tensor_grad(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::GRADIENT)); - } - - template - privilege_mode_to_accessor get_optimizer_tensor(int slot) const { - return this->get_optimizer_tensor(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_optimizer_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER)); - } - - template - privilege_mode_to_accessor get_loss_tensor(int slot) 
const { - return this->get_loss_tensor(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_loss_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::LOSS)); - } - - // variadic tensors - template - std::vector> - get_variadic_tensor(int slot) const { - return this->get_variadic_tensor(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::FORWARD)); - } - - template - std::vector> - get_variadic_tensor_grad(int slot) const { - return this->get_variadic_tensor_grad(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_tensor_grad(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::GRADIENT)); - } - - template - std::vector> - get_variadic_optimizer_tensor(int slot) const { - return this->get_variadic_optimizer_tensor(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_optimizer_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER)); - } - - template - std::vector> - get_variadic_loss_tensor(int slot) const { - return this->get_variadic_loss_tensor(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_loss_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::LOSS)); - } - - Allocator get_allocator() const { - return this->ptr->get_allocator(); - } - - template - static - typename std::enable_if::value, - TaskArgumentAccessor>::type - create(Args &&...args) { - return TaskArgumentAccessor( - std::make_shared(std::forward(args)...)); - } - -private: - TaskArgumentAccessor(std::shared_ptr ptr) - : ptr(ptr) {} - std::shared_ptr ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml deleted file mode 100644 index 48cab9eb01..0000000000 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ /dev/null @@ -1,26 +0,0 @@ -namespace = "FlexFlow" -name = "TaskImplFunction" -features = [ - "eq", - "fmt", - "hash", - "ord" -] - -includes = [ - "local-execution/init_op_task_impl_function.h", - "local-execution/fwd_bwd_op_task_impl_function.h", - "local-execution/generic_task_impl_function.h", -] - -[[values]] -type = "::FlexFlow::InitOpTaskImplFunction" -key = "init_op_task_impl_function" - -[[values]] -type = "::FlexFlow::FwdBwdOpTaskImplFunction" -key = "fwd_bwd_op_task_impl_function" - -[[values]] -type = "::FlexFlow::GenericTaskImplFunction" -key = "generic_task_impl_function" diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml index c3784b617f..f5daa62090 100644 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ b/lib/local-execution/include/local-execution/task_registry.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "local-execution/task_signature_impl.dtg.h", + "task-spec/task_signature_impl.dtg.h", "task-spec/task_id_t.dtg.h", "pcg/layer_guid_t.dtg.h", ] diff --git a/lib/local-execution/include/local-execution/task_signature_impl.h b/lib/local-execution/include/local-execution/task_signature_impl.h deleted file mode 100644 index 613a173f25..0000000000 --- 
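// Illustrative sketch of how a task body consumes the deleted
// TaskArgumentAccessor API; the slot ids and the Permissions template
// arguments below are assumptions for illustration only.
static std::optional<float> example_task_impl(TaskArgumentAccessor const &acc) {
  // Read-only view of the forward tensor bound to slot INPUT.
  auto input = acc.get_tensor<Permissions::RO>(INPUT);
  // Write-only view of the forward tensor bound to slot OUTPUT.
  auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
  // Read-write view of OUTPUT's gradient tensor.
  auto output_grad = acc.get_tensor_grad<Permissions::RW>(OUTPUT);
  // Non-tensor arguments are fetched by slot and static type.
  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
  return std::nullopt;
}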
a/lib/local-execution/include/local-execution/task_signature_impl.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H - -#include "local-execution/task_signature_impl.dtg.h" -#include "op-attrs/computation_graph_op_attrs.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_id_t.dtg.h" - -namespace FlexFlow { - -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &); -std::vector get_task_ids(ComputationGraphOpAttrs const &); - -OpTaskInvocation init(ComputationGraphOpAttrs const &); -OpTaskInvocation forward(ComputationGraphOpAttrs const &); -OpTaskInvocation backward(ComputationGraphOpAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml b/lib/local-execution/include/local-execution/task_signature_impl.struct.toml deleted file mode 100644 index 78064203ec..0000000000 --- a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml +++ /dev/null @@ -1,20 +0,0 @@ -namespace = "FlexFlow" -name = "TaskSignatureAndImpl" -features = [ - "eq", - "fmt", - "hash" -] - -includes = [ - "local-execution/task_impl_function.dtg.h", - "task-spec/op_task_signature.h", -] - -[[fields]] -name = "impl_function" -type = "::FlexFlow::TaskImplFunction" - -[[fields]] -name = "task_signature" -type = "::FlexFlow::OpTaskSignature" diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 196da16ace..ffaeaf285f 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -35,7 +35,8 @@ bool are_allocated_forward_tensors_valid( if (!is_allocated_tensor_backing_valid( TensorTypeVariant{tensor_guid}, allocated_tensors.tensor_type_backings, - ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + array_shape_from_tensor_shape( + tensor_attrs.at(tensor_guid).shape))) { return false; } } else { @@ -58,8 +59,8 @@ bool are_allocated_gradient_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_grad.first).shape); TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -100,8 +101,8 @@ bool are_allocated_optimizer_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_optimizers.first).shape); for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( diff --git a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc deleted file mode 
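// Sketch of the pairing the removed TaskSignatureAndImpl struct expresses:
// one implementation function plus the signature it satisfies, here using
// the attention getters that appear later in this patch (the variable
// name is illustrative).
TaskSignatureAndImpl attention_fwd_sig_impl{
    /*impl_function=*/get_attention_fwd_task_impl(),
    /*task_signature=*/get_attention_fwd_signature(),
};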
100644 index 308dbfd3ae..0000000000 --- a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "local-execution/fwd_bwd_op_task_impl_function.h" - -namespace FlexFlow { - -bool FwdBwdOpTaskImplFunction::operator==( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator!=( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator<( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator>( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator<=( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator>=( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(FwdBwdOpTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} - -std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::FwdBwdOpTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/generic_task_impl_function.cc b/lib/local-execution/src/generic_task_impl_function.cc deleted file mode 100644 index 87d4db53e6..0000000000 --- a/lib/local-execution/src/generic_task_impl_function.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "local-execution/generic_task_impl_function.h" - -namespace FlexFlow { - -bool GenericTaskImplFunction::operator==( - GenericTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool GenericTaskImplFunction::operator!=( - GenericTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool GenericTaskImplFunction::operator<( - GenericTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool GenericTaskImplFunction::operator>( - GenericTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool GenericTaskImplFunction::operator<=( - GenericTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool GenericTaskImplFunction::operator>=( - GenericTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(GenericTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} -std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::GenericTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc deleted file mode 100644 index abe84b828e..0000000000 --- a/lib/local-execution/src/init_op_task_impl_function.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "local-execution/init_op_task_impl_function.h" - -namespace FlexFlow { - -bool 
InitOpTaskImplFunction::operator==( - InitOpTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool InitOpTaskImplFunction::operator!=( - InitOpTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool InitOpTaskImplFunction::operator<( - InitOpTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool InitOpTaskImplFunction::operator>( - InitOpTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool InitOpTaskImplFunction::operator<=( - InitOpTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool InitOpTaskImplFunction::operator>=( - InitOpTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(InitOpTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} -std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::InitOpTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/local-execution/ops/attention.cc b/lib/local-execution/src/local-execution/ops/attention.cc deleted file mode 100644 index a9e6a9fa30..0000000000 --- a/lib/local-execution/src/local-execution/ops/attention.cc +++ /dev/null @@ -1,259 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
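// The three wrapper types deleted here (FwdBwdOpTaskImplFunction,
// GenericTaskImplFunction, InitOpTaskImplFunction) share one pattern:
// equality, ordering, and hashing all delegate to the stored function
// pointer, so two wrappers compare equal only if they wrap the very same
// function. Reduced to a sketch (names illustrative):
struct ExampleTaskImplFunction {
  std::optional<float> (*function_ptr)(TaskArgumentAccessor const &);

  bool operator==(ExampleTaskImplFunction const &other) const {
    // Pointer identity, not behavioral equivalence.
    return this->function_ptr == other.function_ptr;
  }
};
// Hashing applies std::hash to the pointer value itself, so the hash
// relation agrees with operator==.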
- */ - -#include "local-execution/ops/attention.h" -#include "kernels/attention_kernels.h" -#include "op-attrs/ops/attention.h" -#include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" -#include "task-spec/op_task_signature.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::MultiHeadAttention; - -enum Slots { - QUERY_PARALLEL_TENSOR_SHAPE, - KEY_PARALLEL_TENSOR_SHAPE, - VALUE_PARALLEL_TENSOR_SHAPE, - QPROJSIZE, - KPROJSIZE, - VPROJSIZE, - OPROJSIZE, - ATTRS, - PROFILING, - QUERY, - KEY, - VALUE, - WEIGHTS, - OUTPUT, - HANDLE, - PER_DEVICE_STATE -}; - -OpTaskInvocation init(MultiHeadAttentionAttrs const &attrs) { - OpTaskBinding b; - - b.bind_arg(HANDLE, ff_handle()); - b.bind_arg(ATTRS, attrs); - - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(KEY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(1)); - b.bind_arg(VALUE_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(2)); - - b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); - b.bind_arg(KPROJSIZE, get_kProjSize(attrs)); - b.bind_arg(VPROJSIZE, get_vProjSize(attrs)); - b.bind_arg(OPROJSIZE, get_oProjSize(attrs)); - - return {task_id_t::ATTENTION_INIT_TASK_ID, b}; -} - -OpTaskInvocation forward(MultiHeadAttentionAttrs const &attrs) { - OpTaskBinding b; - - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - - return {task_id_t::ATTENTION_FWD_TASK_ID, b}; -} - -OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::ATTENTION_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - Allocator allocator = acc.get_allocator(); - nonnegative_int qProjSize = acc.get_argument(QPROJSIZE); - nonnegative_int kProjSize = acc.get_argument(KPROJSIZE); - nonnegative_int vProjSize = acc.get_argument(VPROJSIZE); - nonnegative_int oProjSize = acc.get_argument(OPROJSIZE); - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - ParallelTensorShape query_parallel_tensor_shape = - acc.get_argument(QUERY_PARALLEL_TENSOR_SHAPE); - ParallelTensorShape key_parallel_tensor_shape = - acc.get_argument(KEY_PARALLEL_TENSOR_SHAPE); - ParallelTensorShape value_parallel_tensor_shape = - acc.get_argument(VALUE_PARALLEL_TENSOR_SHAPE); - - MultiHeadAttentionParallelInputs parsed = throw_if_unexpected( - parse_attention_parallel_input_shape(query_parallel_tensor_shape, - key_parallel_tensor_shape, - value_parallel_tensor_shape)); - ParallelTensorShape weight_parallel_tensor_shape = - throw_if_unexpected(get_weights_shape(attrs, - query_parallel_tensor_shape, - key_parallel_tensor_shape, - value_parallel_tensor_shape)); - - nonnegative_int kvSeqLength = get_kvSeqLength(parsed); - nonnegative_int qSize = get_qSize(parsed); - nonnegative_int kSize = get_kSize(parsed); - nonnegative_int vSize = get_vSize(parsed); - - nonnegative_int qoSeqLength = get_qoSeqLength(parsed); - nonnegative_int num_samples = get_num_samples(parsed); - nonnegative_int num_heads = attrs.num_heads; - - MHAPerDeviceState per_device_state = - init_kernel(handle, - allocator, - num_samples.unwrap_nonnegative(), - num_heads.unwrap_nonnegative(), - qSize.unwrap_nonnegative(), - kSize.unwrap_nonnegative(), - 
vSize.unwrap_nonnegative(), - qProjSize.unwrap_nonnegative(), - kProjSize.unwrap_nonnegative(), - vProjSize.unwrap_nonnegative(), - oProjSize.unwrap_nonnegative(), - qoSeqLength.unwrap_nonnegative(), - kvSeqLength.unwrap_nonnegative(), - attrs.add_bias_kv); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto query = acc.get_tensor(QUERY); - auto key = acc.get_tensor(KEY); - auto value = acc.get_tensor(VALUE); - auto weight = acc.get_tensor(WEIGHTS); - auto output = acc.get_tensor(OUTPUT); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - return profile(forward_kernel, - profiling, - "[MultiHeadAttention] forward_time = {:.2lf}ms\n", - per_device_state, - query.get_float_ptr(), - key.get_float_ptr(), - value.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto query = acc.get_tensor(QUERY); - auto key = acc.get_tensor(KEY); - auto value = acc.get_tensor(VALUE); - auto weight = acc.get_tensor(WEIGHTS); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto weight_grad = acc.get_tensor_grad(WEIGHTS); - auto query_grad = acc.get_tensor_grad(QUERY); - auto key_grad = acc.get_tensor_grad(KEY); - auto value_grad = acc.get_tensor_grad(VALUE); - - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - float *key_grad_ptr = - (key_grad == query_grad) ? nullptr : key_grad.get_float_ptr(); - float *value_grad_ptr = (value_grad == query_grad || value_grad == key_grad) - ? 
nullptr - : value_grad.get_float_ptr(); - - assert(value_grad.shape == value.shape); - assert(key_grad.shape == key.shape); - - assert(query_grad.shape == query.shape); - assert(weight_grad.shape.get_volume() == weight.shape.get_volume()); - - return profile(backward_kernel, - profiling, - "[MultiHeadAttention] backward_time = {:.2lf}ms\n", - per_device_state, - query.get_float_ptr(), - query_grad.get_float_ptr(), - key.get_float_ptr(), - key_grad_ptr, - value.get_float_ptr(), - value_grad_ptr, - weight.get_float_ptr(), - weight_grad.get_float_ptr(), - output_grad.get_float_ptr()); -} - -TaskImplFunction get_attention_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_attention_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_attention_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_attention_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - init.add_arg_slot(QUERY_PARALLEL_TENSOR_SHAPE); - init.add_arg_slot(KEY_PARALLEL_TENSOR_SHAPE); - init.add_arg_slot(VALUE_PARALLEL_TENSOR_SHAPE); - init.add_arg_slot(QPROJSIZE); - init.add_arg_slot(KPROJSIZE); - init.add_arg_slot(VPROJSIZE); - init.add_arg_slot(OPROJSIZE); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_attention_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(QUERY); - fwd.add_input_slot(KEY); - fwd.add_input_slot(VALUE); - fwd.add_weight_slot(WEIGHTS); - fwd.add_output_slot(OUTPUT); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - return fwd; -} - -OpTaskSignature get_attention_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_attention_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(MultiHeadAttentionAttrs const &) { - return {task_id_t::ATTENTION_INIT_TASK_ID, - task_id_t::ATTENTION_FWD_TASK_ID, - task_id_t::ATTENTION_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/batch_matmul.cc b/lib/local-execution/src/local-execution/ops/batch_matmul.cc deleted file mode 100644 index 2cbf1cf20f..0000000000 --- a/lib/local-execution/src/local-execution/ops/batch_matmul.cc +++ /dev/null @@ -1,194 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
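// Note the pattern attention establishes for every op below: backward()
// never re-describes the data flow; it derives its binding from forward()
// with the gradient slots inferred. Schematically (a sketch, assuming an
// attrs value in scope):
OpTaskInvocation fwd = forward(attrs); // binds QUERY/KEY/VALUE/WEIGHTS/OUTPUT
OpTaskBinding bwd_binding = infer_bwd_binding(fwd.binding);
OpTaskInvocation bwd{task_id_t::ATTENTION_BWD_TASK_ID, bwd_binding};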
- */ - -#include "local-execution/ops/batch_matmul.h" -#include "kernels/batch_matmul_kernels.h" -#include "op-attrs/ops/batch_matmul.h" -#include "task-spec/op_task_signature.h" -#include "utils/containers/transform.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::BatchMatmul; - -enum Slots { - A_INPUT, // tensor - B_INPUT, // tensor - ATTRS, - OUTPUT, // tensor - PROFILING, - HANDLE, - ITERATION_CONFIG -}; - -OpTaskInvocation forward(BatchMatmulAttrs const &attrs) { - OpTaskBinding fwd; - - fwd.bind(A_INPUT, input_tensor(0)); - fwd.bind(B_INPUT, input_tensor(1)); - fwd.bind(OUTPUT, output_tensor(0)); - - fwd.bind_arg(ATTRS, attrs); - fwd.bind_arg(HANDLE, ff_handle()); - fwd.bind_arg(PROFILING, profiling_settings()); - fwd.bind_arg(ITERATION_CONFIG, iteration_config()); - - return {task_id_t::BATCHMATMUL_FWD_TASK_ID, fwd}; -} - -OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { - OpTaskBinding bwd = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::BATCHMATMUL_BWD_TASK_ID, bwd}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto a_input = acc.get_tensor(A_INPUT); - auto b_input = acc.get_tensor(B_INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - FFIterationConfig iter_config = - acc.get_argument(ITERATION_CONFIG); - - nonnegative_int m = b_input.shape.at(legion_dim_t{0_n}); - assert(m == output.shape.at(legion_dim_t{0_n})); - nonnegative_int n = a_input.shape.at(legion_dim_t{1_n}); - assert(n == output.shape.at(legion_dim_t{1_n})); - nonnegative_int k = a_input.shape.at(legion_dim_t{0_n}); - assert(k == b_input.shape.at(legion_dim_t{1_n})); - - assert(a_input.shape.get_volume() == b_input.shape.get_volume()); - assert(a_input.shape.get_volume() == output.shape.get_volume()); - - nonnegative_int batch = 1_n; - for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) { - nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i}); - assert(dim_size == b_input.shape.at(legion_dim_t{i})); - assert(dim_size == output.shape.at(legion_dim_t{i})); - batch *= dim_size; - } - - auto get_raw_seq_len = [](std::optional seq_len) -> int { - return transform(seq_len, - [](nonnegative_int x) { return x.unwrap_nonnegative(); }) - .value_or(-1); - }; - - return profile(forward_kernel, - profiling, - "[BatchMatmul] forward_time = {:.2lf}ms\n", - handle, - output.get_float_ptr(), - a_input.get_float_ptr(), - b_input.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative(), - get_raw_seq_len(attrs.a_seq_length_dim), - get_raw_seq_len(attrs.b_seq_length_dim), - iter_config.seq_length); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - // BatchMatmul* bmm = (BatchMatmul*) task->args; - FFIterationConfig iter_config = - acc.get_argument(ITERATION_CONFIG); - ProfilingSettings profiling = acc.get_argument(PROFILING); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - assert(output.shape == output_grad.shape); - - auto a_input = acc.get_tensor(A_INPUT); - auto a_input_grad = acc.get_tensor_grad(A_INPUT); - assert(a_input.shape == a_input_grad.shape); - - auto b_input = acc.get_tensor(B_INPUT); - auto b_input_grad = 
-      acc.get_tensor_grad(B_INPUT);
-  assert(b_input.shape == b_input_grad.shape);
-
-  // check dims
-  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
-  assert(m == output.shape.at(legion_dim_t{0_n}));
-  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
-  assert(n == output.shape.at(legion_dim_t{1_n}));
-  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
-  assert(k == b_input.shape.at(legion_dim_t{1_n}));
-  assert(a_input.shape.get_volume() == b_input.shape.get_volume());
-  assert(a_input.shape.get_volume() == output.shape.get_volume());
-
-  nonnegative_int batch = 1_n;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
-    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
-    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
-    assert(dim_size == output.shape.at(legion_dim_t{i}));
-    batch *= dim_size;
-  }
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[BatchMatmul] backward_time = {:.2lf}ms\n",
-                 handle,
-                 output.get_float_ptr(),
-                 output_grad.get_float_ptr(),
-                 a_input.get_float_ptr(),
-                 a_input_grad.get_float_ptr(),
-                 b_input.get_float_ptr(),
-                 b_input_grad.get_float_ptr(),
-                 m.unwrap_nonnegative(),
-                 n.unwrap_nonnegative(),
-                 k.unwrap_nonnegative(),
-                 batch.unwrap_nonnegative());
-}
-
-TaskImplFunction get_batch_matmul_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_batch_matmul_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_batch_matmul_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_input_slot(A_INPUT);
-  fwd.add_input_slot(B_INPUT);
-  fwd.add_output_slot(OUTPUT);
-  fwd.add_arg_slot(ATTRS);
-  fwd.add_arg_slot(PROFILING);
-  fwd.add_unchecked_arg_slot(HANDLE);
-
-  return fwd;
-}
-
-OpTaskSignature get_batch_matmul_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_batch_matmul_fwd_signature());
-
-  return bwd;
-}
-
-std::vector get_task_ids(BatchMatmulAttrs const &) {
-  return {task_id_t::BATCHMATMUL_FWD_TASK_ID,
-          task_id_t::BATCHMATMUL_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/batch_norm.cc b/lib/local-execution/src/local-execution/ops/batch_norm.cc
deleted file mode 100644
index 97dcb6e103..0000000000
--- a/lib/local-execution/src/local-execution/ops/batch_norm.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
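// Worked example of the dimension bookkeeping in the batch_matmul tasks
// above, with legion_dim_t{0_n} as the innermost dimension (the values
// are illustrative):
//
//   a_input = [k=64, n=64, batch=8]
//   b_input = [m=64, k=64, batch=8]
//   output  = [m=64, n=64, batch=8]
//
// m is read from b_input dim 0, n from a_input dim 1, and k from a_input
// dim 0 (checked against b_input dim 1); every dimension from index 2
// upward must agree across all three tensors, and their product (here 8)
// becomes the batch count passed to the kernel.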
- */
-#include "local-execution/ops/batch_norm.h"
-#include "kernels/batch_norm_kernels.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::BatchNorm;
-
-enum Slots {
-  INPUT,  // tensor
-  SCALE,  // tensor
-  BIAS,   // tensor
-  OUTPUT, // tensor
-  ATTRS,
-  PROFILING,
-  PER_DEVICE_STATE,
-  RELU,
-  HANDLE
-};
-
-OpTaskInvocation init(BatchNormAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(BIAS, input_tensor(2));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  binding.bind_arg(ATTRS, attrs);
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(HANDLE, ff_handle());
-
-  return {task_id_t::BATCHNORM_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(BatchNormAttrs const &attrs) {
-  OpTaskBinding binding;
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state());
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(SCALE, input_tensor(1));
-  binding.bind(BIAS, input_tensor(2));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::BATCHNORM_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(BatchNormAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::BATCHNORM_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  Allocator allocator = acc.get_allocator();
-  PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-
-  auto output = acc.get_tensor(OUTPUT);
-  auto const &attrs = acc.get_argument(ATTRS);
-
-  nonnegative_int output_w = output.shape.at(legion_dim_t{0_n});
-  nonnegative_int output_h = output.shape.at(legion_dim_t{1_n});
-  nonnegative_int output_c = output.shape.at(legion_dim_t{2_n});
-  nonnegative_int output_n = output.shape.at(legion_dim_t{3_n});
-
-  float *runningMean;
-
-  BatchNormPerDeviceState per_device_state =
-      init_kernel(handle,
-                  allocator,
-                  runningMean,
-                  output_n.unwrap_nonnegative(),
-                  output_c.unwrap_nonnegative(),
-                  output_h.unwrap_nonnegative(),
-                  output_w.unwrap_nonnegative(),
-                  attrs.relu);
-
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific::create(per_device_state)};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument(PER_DEVICE_STATE);
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-  auto scale = acc.get_tensor(SCALE);
-  auto bias = acc.get_tensor(BIAS);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[BatchNorm] forward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 input.get_float_ptr(),
-                 output.get_float_ptr(),
-                 scale.get_float_ptr(),
-                 bias.get_float_ptr());
-}
-
-static std::optional
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument(PER_DEVICE_STATE);
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-  auto scale = acc.get_tensor(SCALE);
-  auto scale_grad = acc.get_tensor_grad(SCALE);
-  auto bias_grad = acc.get_tensor_grad(BIAS);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[BatchNorm] backward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 input.get_float_ptr(),
-                 output_grad.get_float_ptr(),
-                 output.get_float_ptr(),
-                 input_grad.get_float_ptr(),
-                 scale.get_float_ptr(),
-                 scale_grad.get_float_ptr(),
-                 bias_grad.get_float_ptr(),
-                 output.shape.get_volume().unwrap_nonnegative());
-}
-
-TaskImplFunction get_batch_norm_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_batch_norm_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_batch_norm_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_batch_norm_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_input_slot(INPUT);
-  init.add_input_slot(BIAS);
-  init.add_output_slot(OUTPUT);
-  init.add_arg_slot(ATTRS);
-  init.add_arg_slot(PROFILING);
-  init.add_unchecked_arg_slot(HANDLE);
-
-  return init;
-}
-
-OpTaskSignature get_batch_norm_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_input_slot(SCALE);
-  fwd.add_input_slot(BIAS);
-  fwd.add_output_slot(OUTPUT);
-  fwd.add_arg_slot(PROFILING);
-  fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
-
-  return fwd;
-}
-OpTaskSignature get_batch_norm_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_batch_norm_fwd_signature());
-
-  return bwd;
-}
-
-std::vector get_task_ids(BatchNormAttrs const &) {
-  return {
-      task_id_t::BATCHNORM_INIT_TASK_ID,
-      task_id_t::BATCHNORM_FWD_TASK_ID,
-      task_id_t::BATCHNORM_BWD_TASK_ID,
-  };
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/cast.cc b/lib/local-execution/src/local-execution/ops/cast.cc
deleted file mode 100644
index e5dd7f9c4e..0000000000
--- a/lib/local-execution/src/local-execution/ops/cast.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "local-execution/ops/cast.h"
-#include "kernels/cast_kernels.h"
-
-#include "task-spec/op_task_signature.h"
-#include "utils/hash-utils.h"
-
-using namespace FlexFlow::Kernels::Cast;
-
-namespace FlexFlow {
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(CastAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(ATTRS, attrs);
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::CAST_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(CastAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::CAST_BWD_TASK_ID, binding};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-  auto const &attrs = acc.get_argument(ATTRS);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[Cast] forward_time = {:.2lf}ms\n",
-                 input,
-                 output,
-                 input.data_type,
-                 attrs.dtype);
-}
-
-static std::optional
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-  auto const &attrs = acc.get_argument(ATTRS);
-
-  auto input = acc.get_tensor(INPUT);
-
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[Cast] backward_time = {:.2lf}ms\n",
-                 input_grad,
-                 output_grad,
-                 input.data_type,
-                 attrs.dtype);
-}
-
-TaskImplFunction get_cast_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_cast_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_cast_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_arg_slot(ATTRS);
-  fwd.add_arg_slot(PROFILING);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-
-  return fwd;
-}
-
-OpTaskSignature get_cast_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_cast_fwd_signature());
-
-  return bwd;
-}
-
-std::vector get_task_ids(CastAttrs const &) {
-  return {task_id_t::CAST_FWD_TASK_ID, task_id_t::CAST_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/combine.cc b/lib/local-execution/src/local-execution/ops/combine.cc
deleted file mode 100644
index 32fab636d3..0000000000
--- a/lib/local-execution/src/local-execution/ops/combine.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
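// Hedged reading of the profile(...) helper used by every task in this
// patch, based only on its call sites here: it forwards the trailing
// arguments to the kernel and, when profiling is enabled, presumably
// returns the elapsed time in milliseconds (logged with the given format
// message), otherwise std::nullopt.
std::optional<float> elapsed = profile(forward_kernel,
                                       profiling,
                                       "[Cast] forward_time = {:.2lf}ms\n",
                                       input,
                                       output,
                                       input.data_type,
                                       attrs.dtype);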
- */ - -#include "local-execution/ops/combine.h" -#include "kernels/combine_kernels.h" -#include "task-spec/op_task_invocation.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Combine; - -enum Slots { INPUT, OUTPUT, PROFILING }; - -OpTaskInvocation forward(CombineAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::COMBINE_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(CombineAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::COMBINE_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Combine] forward_time = {:.2lf}ms\n", - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Combine] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); -} - -OpTaskSignature get_combine_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_combine_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_combine_fwd_signature()); - - return bwd; -} - -TaskImplFunction get_combine_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_combine_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/concat.cc b/lib/local-execution/src/local-execution/ops/concat.cc deleted file mode 100644 index 8531bf77c0..0000000000 --- a/lib/local-execution/src/local-execution/ops/concat.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
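// Unlike cast and concat, the combine.cc removed above defines no
// get_task_ids; had it needed one, it would presumably have mirrored its
// stateless siblings. This sketch is hypothetical and not present in the
// removed file:
std::vector<task_id_t> get_task_ids(CombineAttrs const &) {
  return {task_id_t::COMBINE_FWD_TASK_ID, task_id_t::COMBINE_BWD_TASK_ID};
}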
- */ - -#include "local-execution/ops/concat.h" -#include "kernels/concat_kernels.h" -#include "task-spec/op_task_signature.h" -#include "task-spec/variadic_tensor_ref.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Concat; - -enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; - -OpTaskInvocation forward(ConcatAttrs const &attrs) { - OpTaskBinding binding; - binding.bind(INPUTS, get_input_tensors()); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - - return {task_id_t::CONCAT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ConcatAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::CONCAT_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto const &attrs = acc.get_argument(ATTRS); - - auto output = acc.get_tensor(OUTPUT); - auto inputs = acc.get_variadic_tensor(INPUTS); - - assert(inputs.size() <= MAX_NUM_INPUTS); - - return profile(forward_kernel, - profiling, - "[Concat] forward_time = {:.2lf}ms\n", - output, - inputs, - attrs.axis); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto const &attrs = acc.get_argument(ATTRS); - - auto input_grads = acc.get_variadic_tensor_grad(INPUTS); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - assert(input_grads.size() <= MAX_NUM_INPUTS); - - return profile(backward_kernel, - profiling, - "[Concat] backward_time = {:.2lf}ms\n", - output_grad, - input_grads, - attrs.axis); -} - -TaskImplFunction get_concat_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_concat_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_concat_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(ATTRS); - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUTS, SlotType::VARIADIC); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_concat_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_concat_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(ConcatAttrs const &) { - return {task_id_t::CONCAT_FWD_TASK_ID, task_id_t::CONCAT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/conv_2d.cc b/lib/local-execution/src/local-execution/ops/conv_2d.cc deleted file mode 100644 index 49dbc4b4b1..0000000000 --- a/lib/local-execution/src/local-execution/ops/conv_2d.cc +++ /dev/null @@ -1,184 +0,0 @@ -#include "local-execution/ops/conv_2d.h" -#include "kernels/conv_2d_kernels.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Conv2D; - -enum Slots { - INPUT, - OUTPUT, - FILTER, - BIAS, - ATTRS, - PROFILING, - PER_DEVICE_STATE, - HANDLE -}; - -OpTaskInvocation init(Conv2DAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::CONV2D_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(Conv2DAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - 
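// Concat is the one op here that uses a variadic slot: a single slot id
// carries every input tensor. Sketch of the two sides of that contract
// (the Permissions template argument is a reconstruction for
// illustration):
//
// binding side -- all inputs flow into one slot:
//   binding.bind(INPUTS, get_input_tensors());
// accessor side -- the task receives a vector of read-only accessors:
//   auto inputs = acc.get_variadic_tensor<Permissions::RO>(INPUTS);
//   assert(inputs.size() <= MAX_NUM_INPUTS);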
binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); - binding.bind(BIAS, weight_tensor(1)); - - return {task_id_t::CONV2D_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(Conv2DAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::CONV2D_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto filter = acc.get_tensor(FILTER); - auto filter_grad = acc.get_tensor_grad(FILTER); - - Conv2DPerDeviceState per_device_state = - init_kernel(/*handle=*/handle, - /*activation=*/attrs.activation, - /*kernel_h=*/attrs.kernel_h.unwrap_nonnegative(), - /*kernel_w=*/attrs.kernel_w.unwrap_nonnegative(), - /*groups=*/attrs.groups.unwrap_nonnegative(), - /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), - /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), - /*stride_h=*/attrs.stride_h.unwrap_nonnegative(), - /*stride_w=*/attrs.stride_w.unwrap_nonnegative(), - /*input=*/input, - /*output=*/output, - /*filter_ptr=*/filter.get_float_ptr(), - /*filter_grad_ptr=*/filter_grad.get_float_ptr()); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto attrs = acc.get_argument(ATTRS); - - auto input = acc.get_tensor(INPUT); - auto filter = acc.get_tensor(FILTER); - auto bias = acc.get_tensor(BIAS); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Conv2d] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - filter.get_float_ptr(), - bias.get_float_ptr(), - attrs.activation); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto attrs = acc.get_argument(ATTRS); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto filter = acc.get_tensor(FILTER); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto filter_grad = acc.get_tensor_grad(FILTER); - auto bias_grad = acc.get_tensor_grad(BIAS); - - return profile(backward_kernel, - profiling, - "[Conv2d] backward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), - output.get_float_ptr(), - output_grad.get_float_ptr(), - filter.get_float_ptr(), - filter_grad.get_float_ptr(), - bias_grad.get_float_ptr(), - attrs.activation); -} - -TaskImplFunction get_conv_2d_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_conv_2d_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_conv_2d_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_conv_2d_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - 
- init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT); - init.add_weight_slot(FILTER); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_conv_2d_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_weight_slot(FILTER); - fwd.add_weight_slot(BIAS); - - return fwd; -} - -OpTaskSignature get_conv_2d_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_conv_2d_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(Conv2DAttrs const &) { - return {task_id_t::CONV2D_INIT_TASK_ID, - task_id_t::CONV2D_FWD_TASK_ID, - task_id_t::CONV2D_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/dropout.cc b/lib/local-execution/src/local-execution/ops/dropout.cc deleted file mode 100644 index cc09841190..0000000000 --- a/lib/local-execution/src/local-execution/ops/dropout.cc +++ /dev/null @@ -1,134 +0,0 @@ -#include "local-execution/ops/dropout.h" -#include "kernels/dropout_kernels.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/op_task_signature.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Dropout; - -enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; - -OpTaskInvocation init(DropoutAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(FF_HANDLE, ff_handle()); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::DROPOUT_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(DropoutAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - return {task_id_t::DROPOUT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(DropoutAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::DROPOUT_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto output = acc.get_tensor(OUTPUT); - Allocator allocator = acc.get_allocator(); - PerDeviceFFHandle handle = acc.get_argument(FF_HANDLE); - auto const &attrs = acc.get_argument(ATTRS); - - DropoutPerDeviceState per_device_state = - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Dropout] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - 
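// Conv2D follows the same three-phase lifecycle as attention, batch norm,
// and dropout; the per-device-state handoff from init to forward reduces
// to this sketch (template arguments reconstructed for illustration):
//
// in init_task_impl -- wrap the freshly built state for the device:
//   return DeviceSpecificDeviceStates{
//       DeviceSpecific<Conv2DPerDeviceState>::create(per_device_state)};
// in forward_task_impl -- fetch the same state back out of the binding:
//   auto state =
//       acc.get_argument<Conv2DPerDeviceState>(PER_DEVICE_STATE);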
return profile(backward_kernel, - profiling, - "[Dropout] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad.get_float_ptr(), - input_grad.get_float_ptr()); -} - -TaskImplFunction get_dropout_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_dropout_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_dropout_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_dropout_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(FF_HANDLE); - init.add_output_slot(OUTPUT); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_dropout_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_dropout_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_dropout_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(DropoutAttrs const &) { - return {task_id_t::DROPOUT_INIT_TASK_ID, - task_id_t::DROPOUT_FWD_TASK_ID, - task_id_t::DROPOUT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/element_binary.cc b/lib/local-execution/src/local-execution/ops/element_binary.cc deleted file mode 100644 index ec8ed298d0..0000000000 --- a/lib/local-execution/src/local-execution/ops/element_binary.cc +++ /dev/null @@ -1,180 +0,0 @@ -#include "local-execution/ops/element_binary.h" -#include "kernels/element_binary_kernels.h" -#include "local-execution/task_signature_impl.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::ElementBinary; - -enum Slots { - LHS_INPUT, - RHS_INPUT, - OUTPUT, - PROFILING, - PER_DEVICE_STATE, - HANDLE, - ATTRS -}; - -OpTaskInvocation init(ElementBinaryAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::ELEMENTBINARY_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(ElementBinaryAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::ELEMENTBINARY_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ElementBinaryAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::ELEMENTBINARY_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto input_lhs = acc.get_tensor(LHS_INPUT); - auto input_rhs = acc.get_tensor(RHS_INPUT); - auto output = acc.get_tensor(OUTPUT); - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto const &attrs = acc.get_argument(ATTRS); - - ElementBinaryPerDeviceState per_device_state = - init_kernel(handle, - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - input_lhs.shape, - input_rhs.shape, - 
output.shape); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto const &attrs = acc.get_argument(ATTRS); - - auto input_lhs = acc.get_tensor(LHS_INPUT); - auto input_rhs = acc.get_tensor(RHS_INPUT); - auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - return profile(forward_kernel, - profiling, - "[ElementBinary] forward_time = {:.2lf}ms\n", - per_device_state, - input_lhs.get_float_ptr(), - input_rhs.get_float_ptr(), - output.get_float_ptr(), - attrs.type, - attrs.should_broadcast_lhs, - handle); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto const &attrs = acc.get_argument(ATTRS); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - auto input_lhs = acc.get_tensor(LHS_INPUT); - auto input_rhs = acc.get_tensor(RHS_INPUT); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto input_lhs_grad = acc.get_tensor_grad(LHS_INPUT); - auto input_rhs_grad = acc.get_tensor_grad(RHS_INPUT); - - return profile(backward_kernel, - profiling, - "[ElementBinary] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad.get_float_ptr(), - input_lhs.get_float_ptr(), - input_rhs.get_float_ptr(), - input_lhs_grad.get_float_ptr(), - input_rhs_grad.get_float_ptr(), - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - handle); -} - -TaskImplFunction get_element_binary_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} - -TaskImplFunction get_element_binary_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} - -TaskImplFunction get_element_binary_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_element_binary_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_input_slot(LHS_INPUT); - init.add_input_slot(RHS_INPUT); - init.add_output_slot(OUTPUT); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_element_binary_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(HANDLE); - - fwd.add_input_slot(LHS_INPUT); - fwd.add_input_slot(RHS_INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_element_binary_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_element_binary_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(ElementBinaryAttrs const &) { - return {task_id_t::ELEMENTBINARY_INIT_TASK_ID, - task_id_t::ELEMENTBINARY_FWD_TASK_ID, - task_id_t::ELEMENTBINARY_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/element_unary.cc b/lib/local-execution/src/local-execution/ops/element_unary.cc deleted file mode 100644 index 106c0760cd..0000000000 --- a/lib/local-execution/src/local-execution/ops/element_unary.cc +++ /dev/null @@ -1,165 +0,0 @@ -#include "local-execution/ops/element_unary.h" -#include "kernels/element_unary_kernels.h" 
-#include "op-attrs/parallel_tensor_shape.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -// declare Legion names - -using namespace FlexFlow::Kernels::ElementUnary; - -enum Slots { - INPUT, - INPUT_SHAPE, - OUTPUT, - ATTRS, - HANDLE, - PROFILING, - PER_DEVICE_STATE -}; - -/* ElementUnary */ -OpTaskInvocation init(ElementUnaryAttrs const &attrs) { - OpTaskBinding b; - - b.bind_arg(ATTRS, attrs); - b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); - - return {task_id_t::ELEMENTUNARY_INIT_TASK_ID, b}; -} - -OpTaskInvocation forward(ElementUnaryAttrs const &attrs) { - OpTaskBinding b; - - b.bind(INPUT, input_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - b.bind_arg(ATTRS, attrs); - - b.bind_arg(HANDLE, ff_handle()); - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - return {task_id_t::ELEMENTUNARY_FWD_TASK_ID, b}; -} - -OpTaskInvocation backward(ElementUnaryAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::ELEMENTUNARY_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - - auto attrs = acc.get_argument(ATTRS); - - ParallelTensorShape input_shape = - acc.get_argument(INPUT_SHAPE); - - ParallelTensorShape output_shape = - throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = - init_kernel(ArrayShape{get_piece_shape(input_shape)}, - ArrayShape{get_piece_shape(output_shape)}, - attrs); - - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - auto handle = acc.get_argument(HANDLE); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - return profile(forward_kernel, - profiling, - "[ElementUnary] forward_time = {:.2lf}ms\n", - per_device_state, - attrs, - handle, - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - auto const &attrs = acc.get_argument(ATTRS); - auto handle = acc.get_argument(HANDLE); - - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - return profile(backward_kernel, - profiling, - "[ElementUnary] backward_time = {:.2lf}ms\n", - per_device_state, - attrs, - handle, - input, - input_grad, - output, - output_grad); -} - -TaskImplFunction get_element_unary_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_element_unary_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_element_unary_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_element_unary_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(INPUT_SHAPE); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_element_unary_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - 
- fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - return fwd; -} - -OpTaskSignature get_element_unary_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_element_unary_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(ElementUnaryAttrs const &) { - return {task_id_t::ELEMENTUNARY_INIT_TASK_ID, - task_id_t::ELEMENTUNARY_FWD_TASK_ID, - task_id_t::ELEMENTUNARY_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/flat.cc b/lib/local-execution/src/local-execution/ops/flat.cc deleted file mode 100644 index 87295c2297..0000000000 --- a/lib/local-execution/src/local-execution/ops/flat.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include "local-execution/ops/flat.h" -#include "kernels/flat_kernels.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Flat; - -enum SLOTS { INPUT, OUTPUT, HANDLE, PROFILING }; - -OpTaskInvocation forward(FlatAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - binding.bind_arg(PROFILING, profiling_settings()); - return {task_id_t::FLAT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(FlatAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::FLAT_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Flat] forward_time = {:.2lf}ms\n", - input, - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Flat] backward_time = {:.2lf}ms\n", - input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); -} - -TaskImplFunction get_flat_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_flat_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_flat_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_flat_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_flat_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(FlatAttrs const &) { - return {task_id_t::FLAT_FWD_TASK_ID, task_id_t::FLAT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/gather.cc b/lib/local-execution/src/local-execution/ops/gather.cc deleted file mode 100644 index 7e4b99a557..0000000000 --- a/lib/local-execution/src/local-execution/ops/gather.cc +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/gather.h" -#include "kernels/gather_kernels.h" -#include "utils/nonnegative_int/nonnegative_range.h" -#include - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Gather; - -enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(GatherAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(INDEX, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::GATHER_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(GatherAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(INDEX, weight_tensor(0)); - - return {task_id_t::GATHER_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(GatherAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::GATHER_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto index = acc.get_tensor(INDEX); - auto output = acc.get_tensor(OUTPUT); - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto const &attrs = acc.get_argument(ATTRS); - legion_dim_t legion_dim = - legion_dim_from_ff_dim(attrs.dim, input.shape.num_dims()); - - assert(input.shape.get_dim() == index.shape.get_dim()); - assert(output.shape.get_dim() == index.shape.get_dim()); - - for (nonnegative_int i : nonnegative_range(input.shape.get_dim())) { - assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); - if (i != legion_dim.value) { - assert(input.shape.at(legion_dim_t{i}) == - index.shape.at(legion_dim_t{i})); - } - } - - GatherPerDeviceState per_device_state = {handle, legion_dim}; - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - auto input = acc.get_tensor(INPUT); - auto index = acc.get_tensor(INDEX); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Gather] forward_time = {:.2lf}ms\n", - per_device_state, - input, - index, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto index = acc.get_tensor(INDEX); - auto input_grad = acc.get_tensor_grad(INPUT); - - return profile(backward_kernel, - profiling, - "[Gather] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad, - index, - input_grad); -} 
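Aside: every forward_task_impl and backward_task_impl in these deleted operator files returns through the same profile(...) helper. As a reading aid, here is a minimal sketch of what such a wrapper can look like; profile_sketch and its boolean profiling_enabled flag are illustrative assumptions, not the actual FlexFlow profile helper (which takes a ProfilingSettings and renders the format string shown at each call site).

// Illustrative sketch only (not FlexFlow's actual helper): wraps a kernel
// call and, when profiling is enabled, returns the elapsed time in ms.
#include <chrono>
#include <optional>
#include <string>
#include <utility>

template <typename F, typename... Ts>
std::optional<float> profile_sketch(F const &kernel,
                                    bool profiling_enabled,
                                    std::string const &fmt_msg,
                                    Ts &&...args) {
  if (!profiling_enabled) {
    kernel(std::forward<Ts>(args)...);
    return std::nullopt;
  }
  auto start = std::chrono::steady_clock::now();
  kernel(std::forward<Ts>(args)...);
  // (GPU kernels would additionally need a device synchronization here for
  // the measurement to be accurate.)
  auto stop = std::chrono::steady_clock::now();
  float elapsed_ms =
      std::chrono::duration<float, std::milli>(stop - start).count();
  // A real implementation would log fmt_msg with elapsed_ms substituted in,
  // e.g. "[Gather] forward_time = 1.23ms".
  return elapsed_ms;
}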
- -TaskImplFunction get_gather_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_gather_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_gather_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_gather_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_input_slot(INPUT); - init.add_input_slot(INDEX); - init.add_output_slot(OUTPUT); - - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_gather_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_weight_slot(INDEX); - - return fwd; -} - -OpTaskSignature get_gather_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_gather_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(GatherAttrs const &) { - return {task_id_t::GATHER_INIT_TASK_ID, - task_id_t::GATHER_FWD_TASK_ID, - task_id_t::GATHER_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/input.cc b/lib/local-execution/src/local-execution/ops/input.cc deleted file mode 100644 index d7a3888220..0000000000 --- a/lib/local-execution/src/local-execution/ops/input.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "local-execution/ops/input.h" - -namespace FlexFlow { - -std::vector get_task_ids(InputAttrs const &attrs) { - return {}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/layer_norm.cc b/lib/local-execution/src/local-execution/ops/layer_norm.cc deleted file mode 100644 index d2fc930375..0000000000 --- a/lib/local-execution/src/local-execution/ops/layer_norm.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "local-execution/ops/layer_norm.h"
-#include "kernels/layer_norm_kernels.h"
-#include "op-attrs/ops/layer_norm.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-#include "utils/nonnegative_int/nonnegative_range.h"
-#include
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::LayerNorm;
-
-enum Slots {
- PROFILING,
- INPUT,
- OUTPUT,
- GAMMA,
- BETA,
- PER_DEVICE_STATE,
- ATTRS,
- HANDLE
-};
-
-OpTaskInvocation init(LayerNormAttrs const &attrs) {
- OpTaskBinding b;
-
- b.bind(INPUT, input_tensor(0));
-
- b.bind_arg(HANDLE, ff_handle());
- b.bind_arg(ATTRS, attrs);
-
- return {task_id_t::LAYERNORM_INIT_TASK_ID, b};
-}
-
-OpTaskInvocation forward(LayerNormAttrs const &attrs) {
- OpTaskBinding b;
-
- b.bind(INPUT, input_tensor(0));
- b.bind(OUTPUT, output_tensor(0));
- b.bind(GAMMA, weight_tensor(0)); // TODO: verify how gamma and beta
- b.bind(BETA, weight_tensor(1)); // are ordered among the weight tensors
- b.bind_arg(PROFILING, profiling_settings());
- b.bind_arg(PER_DEVICE_STATE, per_device_op_state());
-
- return {task_id_t::LAYERNORM_FWD_TASK_ID, b};
-}
-
-OpTaskInvocation backward(LayerNormAttrs const &attrs) {
- OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::LAYERNORM_BWD_TASK_ID, b};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
- auto gamma = acc.get_tensor(GAMMA);
- auto beta = acc.get_tensor(BETA);
-
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto &state = acc.get_argument(PER_DEVICE_STATE);
-
- return profile(forward_kernel,
- profiling,
- "[LayerNorm] forward time = {:.2lf}ms\n",
- state,
- input,
- output,
- gamma,
- beta);
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- auto input = acc.get_tensor(INPUT);
- auto gamma = acc.get_tensor(GAMMA);
-
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto gamma_grad = acc.get_tensor_grad(GAMMA);
- auto beta_grad = acc.get_tensor_grad(BETA);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto &state = acc.get_argument(PER_DEVICE_STATE);
-
- return profile(backward_kernel,
- profiling,
- "[LayerNorm] backward time = {:.2lf}ms\n",
- state,
- output_grad,
- input,
- input_grad,
- gamma,
- gamma_grad,
- beta_grad);
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto const &attrs = acc.get_argument(ATTRS);
- Allocator allocator = acc.get_allocator();
- auto input = acc.get_tensor(INPUT);
- auto handle = acc.get_argument(HANDLE);
-
- nonnegative_int M = 1_n;
- for (int i = 0; i < attrs.axes.size(); i++) {
- legion_dim_t legion_dim =
- legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims());
- M *= input.shape.at(legion_dim);
- }
- nonnegative_int num_replicas = 1_n;
- for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) {
- num_replicas *= input.shape.at(legion_dim_t{i});
- }
- nonnegative_int effective_num_elements = M;
- nonnegative_int effective_batch_size = input.shape.get_volume() / M;
-
- LayerNormPerDeviceState per_device_state =
- init_kernel(handle,
- allocator,
- attrs.elementwise_affine,
- effective_batch_size.unwrap_nonnegative(),
- effective_num_elements.unwrap_nonnegative(),
- attrs.eps);
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-TaskImplFunction
get_layer_norm_init_task_impl() {
- return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_layer_norm_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_layer_norm_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_layer_norm_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
- fwd.add_input_slot(INPUT);
- fwd.add_output_slot(OUTPUT);
- fwd.add_weight_slot(GAMMA);
- fwd.add_weight_slot(BETA);
-
- fwd.add_arg_slot(PROFILING);
- fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
- return fwd;
-}
-
-OpTaskSignature get_layer_norm_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_layer_norm_fwd_signature());
- return bwd;
-}
-
-OpTaskSignature get_layer_norm_init_signature() {
- OpTaskSignature init(OpTaskType::INIT);
-
- init.add_input_slot(INPUT);
- init.add_arg_slot(ATTRS);
- init.add_unchecked_arg_slot(HANDLE);
-
- init.add_return_value();
- return init;
-}
-
-std::vector get_task_ids(LayerNormAttrs const &) {
- return {task_id_t::LAYERNORM_INIT_TASK_ID,
- task_id_t::LAYERNORM_FWD_TASK_ID,
- task_id_t::LAYERNORM_BWD_TASK_ID};
-}
-
-} // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc
deleted file mode 100644
index 768293b32f..0000000000
--- a/lib/local-execution/src/local-execution/ops/linear.cc
+++ /dev/null
@@ -1,210 +0,0 @@
-#include "local-execution/ops/linear.h"
-#include "kernels/linear_kernels.h"
-#include "local-execution/task_argument_accessor.h"
-#include "op-attrs/ff_dim_t.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Linear;
-
-enum Slots {
- INPUT,
- OUTPUT,
- WEIGHT,
- BIAS,
- ATTRS,
- PROFILING,
- HANDLE,
- PER_DEVICE_STATE
-};
-
-OpTaskInvocation init(LinearAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(HANDLE, ff_handle());
- binding.bind_arg(ATTRS, attrs);
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(WEIGHT, weight_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::LINEAR_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(LinearAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(WEIGHT, weight_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
- if (attrs.use_bias) {
- binding.bind(BIAS, weight_tensor(1));
- }
-
- binding.bind_arg(PROFILING, profiling_settings());
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
- binding.bind_arg(ATTRS, attrs);
-
- return {task_id_t::LINEAR_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(LinearAttrs const &attrs) {
- OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::LINEAR_BWD_TASK_ID, b};
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto const &attrs = acc.get_argument(ATTRS);
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-
- auto input = acc.get_tensor(INPUT);
- auto weight = acc.get_tensor(WEIGHT);
- auto output = acc.get_tensor(OUTPUT);
- nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n});
- nonnegative_int batch_size = output.shape.at(ff_dim_t{1_n});
-
- // initialized so init_kernel is not handed an indeterminate pointer
- float *one_ptr = nullptr;
-
- LinearPerDeviceState per_device_state =
- init_kernel(handle,
- one_ptr,
- attrs.activation,
attrs.regularizer, - attrs.use_bias, - input.data_type, - weight.data_type, - output.data_type, - batch_size.unwrap_nonnegative(), - attrs.out_channels.unwrap_nonnegative()); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; - - float const *bias_ptr = NULL; - if (attrs.use_bias) { - auto bias = acc.get_tensor(BIAS); - bias_ptr = bias.get_float_ptr(); - } - - return profile(forward_kernel, - profiling, - "[Linear] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - weight.get_float_ptr(), - bias_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - float const *bias_ptr = NULL; - if (attrs.use_bias) { - auto bias = acc.get_tensor(BIAS); - bias_ptr = bias.get_float_ptr(); - } - - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; - - return profile(backward_kernel, - profiling, - "[Linear] backward_time = {:.2lf}ms\n", - per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); -} - -TaskImplFunction get_linear_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_linear_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_linear_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_linear_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_input_slot(INPUT); - init.add_weight_slot(WEIGHT); - init.add_output_slot(OUTPUT); - - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - return init; -} - -OpTaskSignature get_linear_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(INPUT); - fwd.add_weight_slot(WEIGHT); - fwd.add_optional_weight_slot(BIAS); - fwd.add_output_slot(OUTPUT); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - return fwd; -} - 
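All of the get_*_bwd_signature functions in these files (like get_linear_bwd_signature, next) derive the backward signature mechanically from the forward one via infer_bwd_signature, and infer_bwd_binding does the same for bindings. Conceptually, the derivation keeps every forward slot and additionally exposes a gradient slot for each tensor, so backward tasks can read forward values and write grads. A rough sketch of the idea, using hypothetical simplified types rather than the real OpTaskSignature API:

// Conceptual sketch only; the real OpTaskSignature/infer_bwd_signature differ.
#include <vector>

enum class TensorRole { INPUT, OUTPUT, WEIGHT };

struct TensorSlotSpec {
  int slot;
  TensorRole role;
  bool is_grad;
};

using SignatureSketch = std::vector<TensorSlotSpec>;

SignatureSketch infer_bwd_signature_sketch(SignatureSketch const &fwd) {
  SignatureSketch bwd = fwd; // backward can still read all forward tensors
  for (TensorSlotSpec const &s : fwd) {
    // ...and additionally gets a gradient accessor for each tensor slot.
    bwd.push_back(TensorSlotSpec{s.slot, s.role, /*is_grad=*/true});
  }
  return bwd;
}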
-OpTaskSignature get_linear_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_linear_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(LinearAttrs const &) {
- return {task_id_t::LINEAR_INIT_TASK_ID,
- task_id_t::LINEAR_FWD_TASK_ID,
- task_id_t::LINEAR_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/noop.cc b/lib/local-execution/src/local-execution/ops/noop.cc
deleted file mode 100644
index 7357806880..0000000000
--- a/lib/local-execution/src/local-execution/ops/noop.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "local-execution/ops/noop.h"
-
-namespace FlexFlow {
-
-std::vector get_task_ids(NoopAttrs const &attrs) {
- return {};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/pool_2d.cc b/lib/local-execution/src/local-execution/ops/pool_2d.cc
deleted file mode 100644
index 8622732a4d..0000000000
--- a/lib/local-execution/src/local-execution/ops/pool_2d.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-#include "local-execution/ops/pool_2d.h"
-#include "kernels/pool_2d_kernels.h"
-
-#include "op-attrs/ops/pool_2d.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-using namespace FlexFlow::Kernels::Pool2D;
-
-namespace FlexFlow {
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE };
-
-OpTaskInvocation init(Pool2DAttrs const &attrs) {
- OpTaskBinding binding;
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
- binding.bind_arg(ATTRS, attrs);
- binding.bind_arg(HANDLE, ff_handle());
-
- return {task_id_t::POOL2D_INIT_TASK_ID, binding};
-}
-
-static nonnegative_int calculate_padding(nonnegative_int output_size,
- nonnegative_int stride,
- nonnegative_int kernel_size,
- nonnegative_int input_size) {
- int o = output_size.unwrap_nonnegative();
- int s = stride.unwrap_nonnegative();
- int k = kernel_size.unwrap_nonnegative();
- int i = input_size.unwrap_nonnegative();
-
- // solves output_size = (input_size + 2 * padding - kernel_size) / stride + 1
- // for padding, rounding up
- return nonnegative_int{
- ((o - 1) * s + k - i + 1) / 2,
- };
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto const &attrs = acc.get_argument(ATTRS);
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- nonnegative_int input_w = input.shape.at(ff_dim_t{0_n});
- nonnegative_int input_h = input.shape.at(ff_dim_t{1_n});
- nonnegative_int input_c = input.shape.at(ff_dim_t{2_n});
- nonnegative_int input_n = input.shape.at(ff_dim_t{3_n});
- nonnegative_int output_w = output.shape.at(ff_dim_t{0_n});
- nonnegative_int output_h = output.shape.at(ff_dim_t{1_n});
- nonnegative_int output_c = output.shape.at(ff_dim_t{2_n});
- nonnegative_int output_n = output.shape.at(ff_dim_t{3_n});
-
- Pool2DPerDeviceState per_device_state =
- init_kernel(handle,
- attrs.activation,
input_w.unwrap_nonnegative(),
- input_h.unwrap_nonnegative(),
- input_c.unwrap_nonnegative(),
- input_n.unwrap_nonnegative(),
- output_w.unwrap_nonnegative(),
- output_h.unwrap_nonnegative(),
- output_c.unwrap_nonnegative(),
- output_n.unwrap_nonnegative(),
- attrs.padding_h.unwrap_nonnegative(),
- attrs.padding_w.unwrap_nonnegative(),
- attrs.kernel_h.unwrap_nonnegative(),
- attrs.kernel_w.unwrap_nonnegative(),
- attrs.stride_h.unwrap_nonnegative(),
- attrs.stride_w.unwrap_nonnegative(),
- attrs.pool_type);
-
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-OpTaskInvocation forward(Pool2DAttrs const &attrs) {
- OpTaskBinding binding;
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- binding.bind_arg(PROFILING, profiling_settings());
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
-
- return {task_id_t::POOL2D_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(Pool2DAttrs const &attrs) {
- OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::POOL2D_BWD_TASK_ID, b};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- Pool2DPerDeviceState state =
- acc.get_argument(PER_DEVICE_STATE);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[Pool2D] forward_time = {:.2lf}ms\n",
- state,
- input.get_float_ptr(),
- output.get_float_ptr());
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- Pool2DPerDeviceState state =
- acc.get_argument(PER_DEVICE_STATE);
-
- auto input = acc.get_tensor(INPUT);
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output = acc.get_tensor(OUTPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- return profile(backward_kernel,
- profiling,
- "[Pool2D] backward_time = {:.2lf}ms\n",
- state,
- input.get_float_ptr(),
- input_grad.get_float_ptr(),
- output.get_float_ptr(),
- output_grad.get_float_ptr());
-}
-
-TaskImplFunction get_pool_2d_init_task_impl() {
- return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_pool_2d_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_pool_2d_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_pool_2d_init_signature() {
- OpTaskSignature init(OpTaskType::INIT);
-
- init.add_input_slot(INPUT);
- init.add_output_slot(OUTPUT);
-
- init.add_arg_slot(ATTRS);
- init.add_unchecked_arg_slot(HANDLE);
-
- init.add_return_value();
- return init;
-}
-OpTaskSignature get_pool_2d_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
- fwd.add_input_slot(INPUT);
- fwd.add_output_slot(OUTPUT);
- fwd.add_arg_slot(PROFILING);
-
- fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
- return fwd;
-}
-OpTaskSignature get_pool_2d_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_pool_2d_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(Pool2DAttrs const &) {
- return {task_id_t::POOL2D_INIT_TASK_ID,
- task_id_t::POOL2D_FWD_TASK_ID,
- task_id_t::POOL2D_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/reduce.cc b/lib/local-execution/src/local-execution/ops/reduce.cc
deleted file mode 100644
index bc4b5343c2..0000000000
--- a/lib/local-execution/src/local-execution/ops/reduce.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include "local-execution/ops/reduce.h"
-#include "kernels/reduce_kernels.h"
-
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-#include "utils/type_traits_core.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Reduce;
-
-enum Slots {
- INPUT,
- OUTPUT,
- ATTRS,
- PROFILING,
- REDUCE,
- PER_DEVICE_STATE,
- HANDLE
-};
-
-OpTaskInvocation init(ReduceAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(HANDLE, ff_handle());
- binding.bind_arg(ATTRS, attrs);
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::REDUCE_INIT_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
- auto attrs = acc.get_argument(ATTRS);
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- OperatorType op_type = attrs.op_type;
-
- nonnegative_int reduction_size =
- input.shape.get_volume() / output.shape.get_volume();
- ReducePerDeviceState per_device_state =
- init_kernel(handle,
- op_type,
- reduction_size.unwrap_nonnegative(),
- input.shape,
- output.shape);
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-// Note: forward_kernel only needs ReducePerDeviceState, input, output
-OpTaskInvocation forward(ReduceAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
- binding.bind_arg(PROFILING, profiling_settings());
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::REDUCE_FWD_TASK_ID, binding};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[Reduce] forward_time = {:.2lf}ms\n",
- per_device_state,
- input.get_float_ptr(),
- output.get_float_ptr());
-}
-
-OpTaskInvocation backward(ReduceAttrs const &attrs) {
- OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::REDUCE_BWD_TASK_ID, binding};
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- return profile(backward_kernel,
- profiling,
- "[Reduce] backward_time = {:.2lf}ms\n",
- per_device_state,
- output_grad.get_float_ptr(),
- input_grad.get_float_ptr());
-}
-
-TaskImplFunction get_reduce_init_task_impl() {
- return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_reduce_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_reduce_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_reduce_init_signature() {
- OpTaskSignature init(OpTaskType::INIT);
-
- // init() binds INPUT and OUTPUT and init_task_impl reads both, so the
- // signature declares those tensor slots as well.
- init.add_input_slot(INPUT);
- init.add_output_slot(OUTPUT);
-
- init.add_unchecked_arg_slot(HANDLE);
- init.add_arg_slot(ATTRS);
-
- init.add_return_value();
- return init;
-}
-OpTaskSignature get_reduce_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
-
fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_reduce_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_reduce_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(ReduceAttrs const &) { - return {task_id_t::REDUCE_INIT_TASK_ID, - task_id_t::REDUCE_FWD_TASK_ID, - task_id_t::REDUCE_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/reduction.cc b/lib/local-execution/src/local-execution/ops/reduction.cc deleted file mode 100644 index 5e90b30fac..0000000000 --- a/lib/local-execution/src/local-execution/ops/reduction.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/reduction.h" -#include "kernels/reduction_kernels.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Reduction; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(ReductionAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::REDUCTION_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ReductionAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::REDUCTION_BWD_TASK_ID, binding}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling_settings = - acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - nonnegative_int num_replicas = attrs.reduction_degree; - - return profile(forward_kernel, - profiling_settings, - "[Reduction] forward_time = {:.2lf}ms\n", - input, - output, - num_replicas.unwrap_nonnegative()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - return profile(backward_kernel, - profiling, - "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); -} - -TaskImplFunction get_reduction_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_reduction_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_reduction_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature 
get_reduction_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_reduction_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(ReductionAttrs const &) {
- return {task_id_t::REDUCTION_FWD_TASK_ID, task_id_t::REDUCTION_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/repartition.cc b/lib/local-execution/src/local-execution/ops/repartition.cc
deleted file mode 100644
index c1b3bbe3c6..0000000000
--- a/lib/local-execution/src/local-execution/ops/repartition.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "local-execution/ops/repartition.h"
-#include "kernels/partition_kernels.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Repartition;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, HANDLE, PER_DEVICE_STATE };
-
-OpTaskInvocation init(RepartitionAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(HANDLE, ff_handle());
- binding.bind(INPUT, input_tensor(0));
-
- return {task_id_t::REPARTITION_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(RepartitionAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(PROFILING, profiling_settings());
- binding.bind_arg(ATTRS, attrs);
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::REPARTITION_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(RepartitionAttrs const &attrs) {
- OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::REPARTITION_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto input = acc.get_tensor(INPUT);
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-
- // Note: use the input data type
-
- RepartitionPerDeviceState per_device_state =
- init_kernel(handle, input.data_type);
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[Repartition/Partition] forward_time = {:.2lf}ms\n",
- per_device_state,
- input,
- output);
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- return profile(backward_kernel,
- profiling,
- "[Repartition/Partition] backward_time = {:.2lf}ms\n",
- per_device_state, - output_grad, - input_grad); -} - -TaskImplFunction get_repartition_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_repartition_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_repartition_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_repartition_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_unchecked_arg_slot(HANDLE); - init.add_input_slot(INPUT); - init.add_return_value(); - return init; -} -OpTaskSignature get_repartition_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - return fwd; -} -OpTaskSignature get_repartition_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_repartition_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(RepartitionAttrs const &) { - return {task_id_t::REPARTITION_INIT_TASK_ID, - task_id_t::REPARTITION_FWD_TASK_ID, - task_id_t::REPARTITION_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/replicate.cc b/lib/local-execution/src/local-execution/ops/replicate.cc deleted file mode 100644 index ea5be55409..0000000000 --- a/lib/local-execution/src/local-execution/ops/replicate.cc +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "local-execution/ops/replicate.h"
-#include "kernels/replicate_kernels.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Replicate;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(ReplicateAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(PROFILING, profiling_settings());
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
- binding.bind_arg(ATTRS, attrs);
-
- return {task_id_t::REPLICATE_FWD_TASK_ID, binding};
-}
-OpTaskInvocation backward(ReplicateAttrs const &attrs) {
- OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::REPLICATE_BWD_TASK_ID, binding};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[replicate] forward_time = {:.2lf}ms\n",
- input,
- output);
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
- auto attrs = acc.get_argument(ATTRS);
-
- return profile(backward_kernel,
- profiling,
- "[replicate] backward_time = {:.2lf}ms\n",
- input_grad,
- output_grad,
- attrs.replicate_degree.unwrap_nonnegative());
-}
-
-TaskImplFunction get_replicate_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_replicate_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_replicate_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
- fwd.add_arg_slot(PROFILING);
- // forward() binds ATTRS and backward_task_impl reads it through the
- // inferred binding, so the signature declares the slot as well.
- fwd.add_arg_slot(ATTRS);
- fwd.add_input_slot(INPUT);
- fwd.add_output_slot(OUTPUT);
- return fwd;
-}
-
-OpTaskSignature get_replicate_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_replicate_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(ReplicateAttrs const &) {
- return {task_id_t::REPLICATE_FWD_TASK_ID, task_id_t::REPLICATE_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/reshape.cc b/lib/local-execution/src/local-execution/ops/reshape.cc
deleted file mode 100644
index f04785c904..0000000000
--- a/lib/local-execution/src/local-execution/ops/reshape.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "local-execution/ops/reshape.h" -#include "kernels/reshape_kernels.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Reshape; - -enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(ReshapeAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - - return {task_id_t::RESHAPE_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(ReshapeAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - return {task_id_t::RESHAPE_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ReshapeAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::RESHAPE_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto attrs = acc.get_argument(ATTRS); - - ReshapePerDeviceState per_device_state = init_kernel(attrs.shape.data_type); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Reshape] forward time = {:.2lf}ms\n", - per_device_state, - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Reshape] backward time = {:.2lf}ms\n", - per_device_state, - input_grad, - output_grad); -} - -TaskImplFunction get_reshape_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_reshape_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_reshape_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_reshape_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - - init.add_return_value(); - return init; -} -OpTaskSignature get_reshape_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_reshape_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_reshape_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(ReshapeAttrs const &) { - return {task_id_t::RESHAPE_INIT_TASK_ID, - task_id_t::RESHAPE_FWD_TASK_ID, - task_id_t::RESHAPE_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/reverse.cc b/lib/local-execution/src/local-execution/ops/reverse.cc deleted file mode 100644 index 66c0ef7c5e..0000000000 --- a/lib/local-execution/src/local-execution/ops/reverse.cc +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - 
* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/reverse.h" -#include "kernels/accessor.h" -#include "kernels/reverse_kernels.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Reverse; -using coord_t = long long; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(ReverseAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::REVERSE_FWD_TASK_ID, binding}; -} -OpTaskInvocation backward(ReverseAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::REVERSE_BWD_TASK_ID, binding}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - nonnegative_int output_size = output.shape.get_volume(); - auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { - if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{i}); - } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= output.shape.at(ff_dim_t{i}); - } - } - - return profile(forward_kernel, - profiling, - "[reverse] forward_time = {:.2lf}ms\n", - input.get_float_ptr(), - output.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - output_size.unwrap_nonnegative()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - int axis = input_grad.shape.num_dims().unwrap_nonnegative() - - attrs.axis.value.unwrap_nonnegative() - 1; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - - return profile(backward_kernel, - profiling, - "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); 
-} - -TaskImplFunction get_reverse_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_reverse_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_reverse_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} - -OpTaskSignature get_reverse_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_reverse_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(ReverseAttrs const &) { - return {task_id_t::REVERSE_FWD_TASK_ID, task_id_t::REVERSE_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/softmax.cc b/lib/local-execution/src/local-execution/ops/softmax.cc deleted file mode 100644 index 02cebfc4a4..0000000000 --- a/lib/local-execution/src/local-execution/ops/softmax.cc +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/softmax.h" -#include "kernels/softmax_kernels.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { -using namespace FlexFlow::Kernels::Softmax; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE }; - -OpTaskInvocation init(SoftmaxAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(HANDLE, ff_handle()); - binding.bind_arg(ATTRS, attrs); - return {task_id_t::SOFTMAX_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(SoftmaxAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::SOFTMAX_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(SoftmaxAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::SOFTMAX_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - auto output = acc.get_tensor(OUTPUT); - auto const &attrs = acc.get_argument(ATTRS); - - nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); - - SoftmaxPerDeviceState per_device_state = - init_kernel(handle, - attrs.dim.value.unwrap_nonnegative(), - output_n.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_w.unwrap_nonnegative()); - - return DeviceSpecificDeviceStates{ - 
DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - return profile(forward_kernel, - profiling, - "[SoftMax] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto input = acc.get_tensor(INPUT); - assert(input_grad.shape == input.shape); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto output = acc.get_tensor(OUTPUT); - - assert(output_grad.shape == output.shape); - - return profile(backward_kernel, - profiling, - "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - output_grad.shape.get_volume().unwrap_nonnegative()); -} - -TaskImplFunction get_softmax_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_softmax_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_softmax_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_softmax_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_unchecked_arg_slot(HANDLE); - init.add_arg_slot(ATTRS); - init.add_return_value(); - return init; -} -OpTaskSignature get_softmax_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_softmax_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_softmax_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(SoftmaxAttrs const &) { - return {task_id_t::SOFTMAX_INIT_TASK_ID, - task_id_t::SOFTMAX_FWD_TASK_ID, - task_id_t::SOFTMAX_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/split.cc b/lib/local-execution/src/local-execution/ops/split.cc deleted file mode 100644 index 5661fa7381..0000000000 --- a/lib/local-execution/src/local-execution/ops/split.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "local-execution/ops/split.h" -#include "kernels/array_shape.h" -#include "kernels/split_kernels.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Split; -using coord_t = long long; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(SplitAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::SPLIT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(SplitAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::SPLIT_BWD_TASK_ID, binding}; -} - -static std::pair - calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) { - nonnegative_int num_blocks = 1_n; - nonnegative_int block_size = 1_n; - for (nonnegative_int d : nonnegative_range(array_shape.num_elements())) { - if (d <= axis.value) { - block_size *= array_shape.at(legion_dim_t{d}); - } else { - num_blocks *= array_shape.at(legion_dim_t{d}); - } - } - return {num_blocks, block_size}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - coord_t out_block_sizes[MAX_NUM_OUTPUTS]; - auto [num_blocks, in_block_size] = calc_block_size(input.shape, attrs.axis); - - for (int i = 0; i < attrs.splits.size(); i++) { - auto [_, out_block_size] = calc_block_size(output.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); - } - float *output_float_ptr = output.get_float_ptr(); - return profile(forward_kernel, - profiling, - "Split forward_time = {:.2lf}ms\n", - &output_float_ptr, - input.get_float_ptr(), - out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), - attrs.splits.size()); -} - -// maybe we should add assert like the original code -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - coord_t out_block_sizes[MAX_NUM_OUTPUTS]; - auto [num_blocks, in_block_size] = - calc_block_size(input_grad.shape, attrs.axis); - - for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blocks; - auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); - } - float const *output_grad_ptr = output_grad.get_float_ptr(); - return profile(backward_kernel, - profiling, - "Split backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), - &output_grad_ptr, - out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), - attrs.splits.size()); -} - -TaskImplFunction get_split_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_split_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_split_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - 
fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_split_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_split_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(SplitAttrs const &) { - return {task_id_t::SPLIT_FWD_TASK_ID, task_id_t::SPLIT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/topk.cc b/lib/local-execution/src/local-execution/ops/topk.cc deleted file mode 100644 index fd895605a1..0000000000 --- a/lib/local-execution/src/local-execution/ops/topk.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/topk.h" -#include "kernels/topk_kernels.h" -#include "utils/exception.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::TopK; - -// For an input tensor, computes the top k entries in each row -// (resp. vector along the last dimension). Thus, -// values.shape = indices.shape = input.shape[:-1] + [k] - -enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(TopKAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - - return {task_id_t::TOPK_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(TopKAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(INDICES, output_tensor(1)); - - return {task_id_t::TOPK_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(TopKAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::TOPK_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - - auto attrs = acc.get_argument(ATTRS); - - TopKPerDeviceState per_device_state = init_kernel(attrs.sorted); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto attrs = acc.get_argument(ATTRS); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - nonnegative_int length = input.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input.shape.get_volume() / length; - auto indices = acc.get_tensor(INDICES); - - return profile(forward_kernel, - profiling, - "[TopK] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - indices.get_int32_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative(), - attrs.sorted); -} - -static std::optional - 
backward_task_impl(TaskArgumentAccessor const &acc) { - auto attrs = acc.get_argument(ATTRS); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - auto indices = acc.get_tensor(INDICES); - - nonnegative_int length = input_grad.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input_grad.shape.get_volume() / length; - - return profile(backward_kernel, - profiling, - "[TopK] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad.get_float_ptr(), - indices.get_int32_ptr(), - input_grad.get_float_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative()); -} - -TaskImplFunction get_topk_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_topk_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_topk_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_topk_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_return_value(); - - return init; -} -OpTaskSignature get_topk_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_output_slot(INDICES); - return fwd; -} -OpTaskSignature get_topk_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_topk_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(TopKAttrs const &) { - return {task_id_t::TOPK_INIT_TASK_ID, - task_id_t::TOPK_FWD_TASK_ID, - task_id_t::TOPK_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc deleted file mode 100644 index 62db7b5266..0000000000 --- a/lib/local-execution/src/local-execution/ops/transpose.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "local-execution/ops/transpose.h" -#include "kernels/transpose_kernels.h" -#include "op-attrs/ops/transpose.h" -#include "utils/integer_conversions.h" - -using namespace FlexFlow::Kernels::Transpose; - -namespace FlexFlow { - -enum Slots { - INPUT, // tensor - OUTPUT, // tensor - ATTRS, - PROFILING, -}; - -OpTaskInvocation forward(TransposeAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::TRANSPOSE_FWD_TASK_ID, binding}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Transpose] Forward_time = {:.2lf} [ms]", - attrs, - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Transpose] Backward_time = {:.2lf} [ms]", - attrs, - input_grad, - output_grad); -} - -OpTaskInvocation backward(TransposeAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding}; -} - -TaskImplFunction get_transpose_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} - -TaskImplFunction get_transpose_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_transpose_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} - -OpTaskSignature get_transpose_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_transpose_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(TransposeAttrs const &) { - return {task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/weight.cc b/lib/local-execution/src/local-execution/ops/weight.cc deleted file mode 100644 index f96c104f33..0000000000 --- a/lib/local-execution/src/local-execution/ops/weight.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "local-execution/ops/weight.h" - -namespace FlexFlow { - -std::vector get_task_ids(WeightAttrs const &attrs) { - return {}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc deleted file mode 100644 index 4ca5f987a8..0000000000 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ /dev/null @@ -1,24 +0,0 @@ -#include "local-execution/local_cpu_allocator.h" -#include "utils/containers/contains_key.h" - -namespace FlexFlow { -void *LocalCPUAllocator::allocate(size_t requested_memory_size) { - void *ptr = malloc(requested_memory_size); - this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); - return ptr; -} - -void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this 
Allocator"); - } -} - -Allocator create_local_cpu_memory_allocator() { - return Allocator::create(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index e53e3abeff..2e82378fdb 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, TensorType tensor_type) const { SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index d508c34210..3b1bb0fd2d 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,12 +1,12 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include "local-execution/task_signature_impl.h" #include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "task-spec/op_task_to_task_invocation.h" #include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 15ebdd5f28..4d0b32fd48 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -16,6 +16,7 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" #include "local-execution/loss_functions.h" +#include "kernels/format_accessor_contents.h" #include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -55,44 +56,41 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = - logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); + + int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape 
is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { - assert(logit.shape.get_volume() == label.shape.get_volume()); - scale_factor = 2.0f / logit.shape.get_volume().unwrap_nonnegative(); + ASSERT(logit.shape.num_elements() == label.shape.num_elements()); + scale_factor = 2.0f / logit.shape.num_elements().int_from_positive_int(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); - assert(logit_grad.shape == logit.shape); + int num_classes = logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() / + .int_from_positive_int() / label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative(); // TODO FIXME something seems wrong here, - // isn't the numerator guaranteed to be 1? - // <--- this is not the case because of the - // potential parallel dim + .int_from_positive_int(); // TODO FIXME something seems wrong + // here, isn't the numerator guaranteed + // to be 1? + // <--- this is not the case because of + // the potential parallel dim } - assert( - label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() == + ASSERT(label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); + ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .int_from_positive_int() == logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative()); - assert( - label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == - 1); + .int_from_positive_int()); + ASSERT(label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, @@ -100,28 +98,34 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), batch_size, num_classes, k, scale_factor); } else { - assert(logit.shape == label.shape); - assert(logit_grad.shape == logit.shape); + ASSERT(logit.shape == label.shape); + ASSERT(logit_grad.shape == logit.shape); int num_channels = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { + size_t logit_volume = get_num_elements(logit.shape).int_from_positive_int(); + size_t logit_grad_volume = + get_num_elements(logit_grad.shape).int_from_positive_int(); + profile(categorical_crossentropy_loss_backward_kernel, profiling, "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - 
get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + logit_volume, + logit_grad_volume, scale_factor); + + break; } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { @@ -131,8 +135,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } @@ -142,13 +146,13 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } default: - throw mk_runtime_error(fmt::format( + PANIC(fmt::format( "Unsupported loss function {}. Please report this as an issue.", loss_type)); } diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc index da1efa6b85..f5ce639087 100644 --- a/lib/local-execution/src/loss_tensor_source.cc +++ b/lib/local-execution/src/loss_tensor_source.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -size_t LossTensorSource::next_available_loss_tensor_id = 0; +nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n; LossTensorSource::LossTensorSource() {} diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d214d0d426..d3c1c65a68 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,4 +1,5 @@ #include "local-execution/model_training_instance.h" +#include "kernels/format_accessor_contents.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/reversed.h" @@ -34,6 +35,13 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->label_tensor, this->allocator); + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : reversed( topological_ordering(this->training_backing.computation_graph))) { @@ -54,14 +62,19 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } -void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { +GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + GenericTensorAccessorW logit_tensor_backing = this->training_backing + .local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); + + gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing - .tensor_gradient_mapping.at(this->logit_tensor); + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - write_to_host_float_ptr(loss_tensor_backing, host_ptr); + + return 
read_only_accessor_from_write_accessor(loss_tensor_backing);
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 1b8fc37b2d..1d65172e67 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -66,18 +66,18 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
   auto weight = acc.get_tensor(WEIGHT);
   auto profiling = acc.get_argument(PROFILING);
 
-  assert(weight.shape == weight_grad.shape);
-  int size = weight_grad.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight.shape == weight_grad.shape);
+  int size = weight_grad.shape.num_elements().int_from_positive_int();
 
-  assert(weight_grad.shape.get_volume().unwrap_nonnegative() &
-         weight.shape.get_volume().unwrap_nonnegative());
-  int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() /
-                     weight.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight_grad.shape.num_elements().int_from_positive_int() %
+         weight.shape.num_elements().int_from_positive_int() == 0);
+  int num_replicas = weight_grad.shape.num_elements().int_from_positive_int() /
+                     weight.shape.num_elements().int_from_positive_int();
 
   float *sgd_v_ptr;
   if (attrs.momentum > 0.0f) {
     auto sgd_v = acc.get_optimizer_tensor(SGD_V);
-    assert(sgd_v.shape == weight.shape);
+    ASSERT(sgd_v.shape == weight.shape);
     sgd_v_ptr = sgd_v.get_float_ptr();
   }
 
@@ -180,14 +180,10 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
 
   auto profiling = acc.get_argument(PROFILING);
 
-  assert(weight.shape == weight_grad.shape);
-  int size = weight_grad.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight.shape == weight_grad.shape);
+  int size = weight_grad.shape.num_elements().int_from_positive_int();
 
-  assert(weight_grad.shape.get_volume().unwrap_nonnegative() %
-         weight.shape.get_volume().unwrap_nonnegative() ==
-         0);
-  int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() /
-                     weight.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight_grad.shape.num_elements() % weight.shape.num_elements() == 0);
 
   auto handle = acc.get_argument(HANDLE);
   profile(adam_nccl_update_task_gpu,
@@ -198,9 +194,9 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
           attrs.beta2,
           attrs.weight_decay,
           attrs.epsilon,
-          size,
           handle,
           weight_grad.get_float_ptr(),
+          size,
           m_tensor.get_float_ptr(),
           v_tensor.get_float_ptr(),
           weight.get_float_ptr()); // how to deal with removal of ParamSync?
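A note on the two optimizer hunks above: sgd_update_task_impl derives size from the gradient tensor and num_replicas from the gradient-to-weight volume ratio, then hands everything to the GPU kernels. The CPU sketch below shows the update this sets up. It is a sketch under assumptions: the replica layout (replica r of element i at weight_grad_ptr[r * weight_size + i]) and the exact momentum/weight-decay formulation are inferred, not taken from kernels/optimizer_kernels.h.

#include <cstddef>

// CPU reference for the SGD update prepared by sgd_update_task_impl.
// In the task above, size is the gradient volume, so the per-weight
// element count used here is weight_size = size / num_replicas.
void sgd_update_cpu(float lr,
                    float momentum,
                    bool nesterov,
                    float weight_decay,
                    float const *weight_grad_ptr, // weight_size * num_replicas elements
                    size_t weight_size,
                    int num_replicas,
                    float *weight_ptr,  // weight_size elements
                    float *sgd_v_ptr) { // weight_size elements; unused if momentum == 0
  for (size_t i = 0; i < weight_size; i++) {
    // Accumulate the gradient across replicas, then apply L2 weight decay.
    float g = 0.0f;
    for (int r = 0; r < num_replicas; r++) {
      g += weight_grad_ptr[r * weight_size + i];
    }
    g += weight_decay * weight_ptr[i];

    if (momentum > 0.0f) {
      sgd_v_ptr[i] = momentum * sgd_v_ptr[i] + g;
      g = nesterov ? g + momentum * sgd_v_ptr[i] : sgd_v_ptr[i];
    }
    weight_ptr[i] -= lr * g;
  }
}

Summing replicas locally here stands in for the cross-device gradient aggregation that the NCCL task variants presumably perform.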
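The same reading applies to the loss_functions.cc hunk earlier in this patch, where scale_factor folds batch averaging (1.0f / batch_size) into the backward kernels. Below is a minimal CPU sketch of the CATEGORICAL_CROSSENTROPY branch, assuming logit holds post-softmax probabilities and that the kernel computes the standard softmax cross-entropy gradient; the actual implementation lives in kernels/loss_function_kernels.h and may differ in details.

#include <cstddef>

// Hedged CPU sketch of categorical cross-entropy backward: with
// scale_factor = 1.0f / batch_size, the gradient is averaged over the
// batch dimension rather than summed.
void categorical_crossentropy_backward_cpu(float *logit_grad,
                                           float const *logit,
                                           float const *label,
                                           size_t logit_volume,
                                           float scale_factor) {
  for (size_t i = 0; i < logit_volume; i++) {
    logit_grad[i] = (logit[i] - label[i]) * scale_factor;
  }
}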
diff --git a/lib/task-spec/src/per_device_op_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/task-spec/src/per_device_op_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/permissions.cc b/lib/local-execution/src/permissions.cc deleted file mode 100644 index 2286215987..0000000000 --- a/lib/local-execution/src/permissions.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include "local-execution/permissions.h" -#include "utils/exception.h" - -namespace FlexFlow { - -Permissions join(Permissions lhs, Permissions rhs) { - if (lhs <= rhs) { - return rhs; - } else if (rhs <= lhs) { - return lhs; - } else { - return Permissions::RW; - } -} - -Permissions meet(Permissions lhs, Permissions rhs) { - if (lhs <= rhs) { - return lhs; - } else if (rhs <= lhs) { - return rhs; - } else { - return Permissions::NONE; - } -} - -static int as_int(Permissions p) { - switch (p) { - case Permissions::NONE: - return 0; - case Permissions::RO: - case Permissions::WO: - return 1; - case Permissions::RW: - return 2; - default: - throw mk_runtime_error( - fmt::format("Unknown permission {}", static_cast(p))); - } -} - -static bool comparable(Permissions lhs, Permissions rhs) { - return !(lhs == Permissions::RO && rhs == Permissions::WO || - lhs == Permissions::WO && rhs == Permissions::RO); -} - -bool operator<(Permissions lhs, Permissions rhs) { - if (!comparable(lhs, rhs)) { - return false; - } - int lhs_int = as_int(lhs); - int rhs_int = as_int(rhs); - return lhs_int < rhs_int; -} - -bool operator<=(Permissions lhs, Permissions rhs) { - return (lhs < rhs) || (lhs == rhs); -} - -bool operator>(Permissions lhs, Permissions rhs) { - if (!comparable(lhs, rhs)) { - return false; - } - int lhs_int = as_int(lhs); - int rhs_int = as_int(rhs); - return lhs_int > rhs_int; -} - -bool operator>=(Permissions lhs, Permissions rhs) { - return (lhs > rhs) || (lhs == rhs); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 2787342a5f..ae3d97daa4 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,6 +1,6 @@ #include "local-execution/task_registry.h" -#include "local-execution/task_signature_impl.h" #include "pcg/computation_graph.h" +#include "task-spec/task_signature_impl.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc deleted file mode 100644 index 9031d2a015..0000000000 --- a/lib/local-execution/src/task_signature_impl.cc +++ /dev/null @@ -1,366 +0,0 @@ -#include "local-execution/task_signature_impl.h" -#include "local-execution/ops/attention.h" -#include "local-execution/ops/batch_matmul.h" -#include "local-execution/ops/batch_norm.h" -#include "local-execution/ops/cast.h" -#include "local-execution/ops/combine.h" -#include "local-execution/ops/concat.h" -#include "local-execution/ops/conv_2d.h" -#include "local-execution/ops/dropout.h" -#include "local-execution/ops/element_binary.h" -#include "local-execution/ops/element_unary.h" -#include "local-execution/ops/embedding.h" -#include "local-execution/ops/flat.h" -#include "local-execution/ops/gather.h" -#include "local-execution/ops/input.h" -#include "local-execution/ops/layer_norm.h" -#include "local-execution/ops/linear.h" -#include "local-execution/ops/noop.h" -#include "local-execution/ops/pool_2d.h" -#include "local-execution/ops/reduce.h" -#include 
"local-execution/ops/reduction.h" -#include "local-execution/ops/repartition.h" -#include "local-execution/ops/replicate.h" -#include "local-execution/ops/reshape.h" -#include "local-execution/ops/reverse.h" -#include "local-execution/ops/softmax.h" -#include "local-execution/ops/split.h" -#include "local-execution/ops/topk.h" -#include "local-execution/ops/transpose.h" -#include "local-execution/ops/weight.h" -#include "utils/overload.h" - -namespace FlexFlow { - -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { - switch (task_id) { - case task_id_t::ELEMENTBINARY_INIT_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_init_task_impl(), - get_element_binary_init_signature()}; - case task_id_t::ELEMENTBINARY_FWD_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_fwd_task_impl(), - get_element_binary_fwd_signature()}; - case task_id_t::ELEMENTBINARY_BWD_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_bwd_task_impl(), - get_element_binary_bwd_signature()}; - case task_id_t::ELEMENTUNARY_INIT_TASK_ID: - return TaskSignatureAndImpl{get_element_unary_init_task_impl(), - get_element_unary_init_signature()}; - case task_id_t::ELEMENTUNARY_FWD_TASK_ID: - return TaskSignatureAndImpl{get_element_unary_fwd_task_impl(), - get_element_unary_fwd_signature()}; - case task_id_t::ELEMENTUNARY_BWD_TASK_ID: - return TaskSignatureAndImpl{get_element_unary_bwd_task_impl(), - get_element_unary_bwd_signature()}; - case task_id_t::CONV2D_INIT_TASK_ID: - return TaskSignatureAndImpl{get_conv_2d_init_task_impl(), - get_conv_2d_init_signature()}; - case task_id_t::CONV2D_FWD_TASK_ID: - return TaskSignatureAndImpl{get_conv_2d_fwd_task_impl(), - get_conv_2d_fwd_signature()}; - case task_id_t::CONV2D_BWD_TASK_ID: - return TaskSignatureAndImpl{get_conv_2d_bwd_task_impl(), - get_conv_2d_bwd_signature()}; - case task_id_t::DROPOUT_INIT_TASK_ID: - return TaskSignatureAndImpl{get_dropout_init_task_impl(), - get_dropout_init_signature()}; - case task_id_t::DROPOUT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_dropout_fwd_task_impl(), - get_dropout_fwd_signature()}; - case task_id_t::DROPOUT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_dropout_bwd_task_impl(), - get_dropout_bwd_signature()}; - // case task_id_t::EMBED_FWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_fwd_task_impl(), - // get_embedding_fwd_signature()}; - // case task_id_t::EMBED_BWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_bwd_task_impl(), - // get_embedding_bwd_signature()}; - case task_id_t::GATHER_INIT_TASK_ID: - return TaskSignatureAndImpl{get_gather_init_task_impl(), - get_gather_init_signature()}; - case task_id_t::GATHER_FWD_TASK_ID: - return TaskSignatureAndImpl{get_gather_fwd_task_impl(), - get_gather_fwd_signature()}; - case task_id_t::GATHER_BWD_TASK_ID: - return TaskSignatureAndImpl{get_gather_bwd_task_impl(), - get_gather_bwd_signature()}; - case task_id_t::CAST_FWD_TASK_ID: - return TaskSignatureAndImpl{get_cast_fwd_task_impl(), - get_cast_fwd_signature()}; - case task_id_t::CAST_BWD_TASK_ID: - return TaskSignatureAndImpl{get_cast_bwd_task_impl(), - get_cast_bwd_signature()}; - case task_id_t::POOL2D_INIT_TASK_ID: - return TaskSignatureAndImpl{get_pool_2d_init_task_impl(), - get_pool_2d_init_signature()}; - case task_id_t::POOL2D_FWD_TASK_ID: - return TaskSignatureAndImpl{get_pool_2d_fwd_task_impl(), - get_pool_2d_fwd_signature()}; - case task_id_t::POOL2D_BWD_TASK_ID: - return TaskSignatureAndImpl{get_pool_2d_bwd_task_impl(), - get_pool_2d_bwd_signature()}; - case 
task_id_t::BATCHNORM_INIT_TASK_ID: - return TaskSignatureAndImpl{get_batch_norm_init_task_impl(), - get_batch_norm_init_signature()}; - case task_id_t::BATCHNORM_FWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_norm_fwd_task_impl(), - get_batch_norm_fwd_signature()}; - case task_id_t::BATCHNORM_BWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_norm_bwd_task_impl(), - get_batch_norm_bwd_signature()}; - case task_id_t::BATCHMATMUL_FWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_matmul_fwd_task_impl(), - get_batch_matmul_fwd_signature()}; - case task_id_t::BATCHMATMUL_BWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_matmul_bwd_task_impl(), - get_batch_matmul_bwd_signature()}; - case task_id_t::LAYERNORM_INIT_TASK_ID: - return TaskSignatureAndImpl{get_layer_norm_init_task_impl(), - get_layer_norm_init_signature()}; - case task_id_t::LAYERNORM_FWD_TASK_ID: - return TaskSignatureAndImpl{get_layer_norm_fwd_task_impl(), - get_layer_norm_init_signature()}; - case task_id_t::LAYERNORM_BWD_TASK_ID: - return TaskSignatureAndImpl{get_layer_norm_bwd_task_impl(), - get_layer_norm_bwd_signature()}; - case task_id_t::LINEAR_INIT_TASK_ID: - return TaskSignatureAndImpl{get_linear_init_task_impl(), - get_linear_init_signature()}; - case task_id_t::LINEAR_FWD_TASK_ID: - return TaskSignatureAndImpl{get_linear_fwd_task_impl(), - get_linear_fwd_signature()}; - case task_id_t::LINEAR_BWD_TASK_ID: - return TaskSignatureAndImpl{get_linear_bwd_task_impl(), - get_linear_bwd_signature()}; - case task_id_t::FLAT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_flat_fwd_task_impl(), - get_flat_fwd_signature()}; - case task_id_t::FLAT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_flat_bwd_task_impl(), - get_flat_bwd_signature()}; - case task_id_t::SOFTMAX_INIT_TASK_ID: - return TaskSignatureAndImpl{get_softmax_init_task_impl(), - get_softmax_init_signature()}; - case task_id_t::SOFTMAX_FWD_TASK_ID: - return TaskSignatureAndImpl{get_softmax_fwd_task_impl(), - get_softmax_fwd_signature()}; - case task_id_t::SOFTMAX_BWD_TASK_ID: - return TaskSignatureAndImpl{get_softmax_bwd_task_impl(), - get_softmax_bwd_signature()}; - case task_id_t::CONCAT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_concat_fwd_task_impl(), - get_concat_fwd_signature()}; - case task_id_t::CONCAT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_concat_bwd_task_impl(), - get_concat_bwd_signature()}; - case task_id_t::SPLIT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_split_fwd_task_impl(), - get_split_fwd_signature()}; - case task_id_t::SPLIT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_split_bwd_task_impl(), - get_split_bwd_signature()}; - case task_id_t::REDUCE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_reduce_init_task_impl(), - get_reduce_init_signature()}; - case task_id_t::REDUCE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reduce_fwd_task_impl(), - get_reduce_fwd_signature()}; - case task_id_t::REDUCE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reduce_bwd_task_impl(), - get_reduce_bwd_signature()}; - case task_id_t::RESHAPE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_reshape_init_task_impl(), - get_reshape_init_signature()}; - case task_id_t::RESHAPE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reshape_fwd_task_impl(), - get_reshape_fwd_signature()}; - case task_id_t::RESHAPE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reshape_bwd_task_impl(), - get_reshape_bwd_signature()}; - case task_id_t::REVERSE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reverse_fwd_task_impl(), - get_reverse_fwd_signature()}; - case 
task_id_t::REVERSE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reverse_bwd_task_impl(), - get_reverse_bwd_signature()}; - case task_id_t::TOPK_INIT_TASK_ID: - return TaskSignatureAndImpl{get_topk_init_task_impl(), - get_topk_init_signature()}; - case task_id_t::TOPK_FWD_TASK_ID: - return TaskSignatureAndImpl{get_topk_fwd_task_impl(), - get_topk_fwd_signature()}; - case task_id_t::TOPK_BWD_TASK_ID: - return TaskSignatureAndImpl{get_topk_bwd_task_impl(), - get_topk_bwd_signature()}; - case task_id_t::TRANSPOSE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_transpose_fwd_task_impl(), - get_transpose_fwd_signature()}; - case task_id_t::TRANSPOSE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_transpose_bwd_task_impl(), - get_transpose_bwd_signature()}; - case task_id_t::ATTENTION_INIT_TASK_ID: - return TaskSignatureAndImpl{get_attention_init_task_impl(), - get_attention_init_signature()}; - case task_id_t::ATTENTION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_attention_fwd_task_impl(), - get_attention_fwd_signature()}; - case task_id_t::ATTENTION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_attention_bwd_task_impl(), - get_attention_bwd_signature()}; - case task_id_t::COMBINE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_fwd_task_impl(), - get_combine_fwd_signature()}; - case task_id_t::COMBINE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_bwd_task_impl(), - get_combine_bwd_signature()}; - case task_id_t::REDUCTION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_fwd_task_impl(), - get_reduction_fwd_signature()}; - case task_id_t::REDUCTION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_bwd_task_impl(), - get_reduction_bwd_signature()}; - case task_id_t::REPARTITION_INIT_TASK_ID: - return TaskSignatureAndImpl{get_repartition_init_task_impl(), - get_repartition_init_signature()}; - case task_id_t::REPARTITION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_fwd_task_impl(), - get_repartition_fwd_signature()}; - case task_id_t::REPARTITION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_bwd_task_impl(), - get_repartition_bwd_signature()}; - case task_id_t::REPLICATE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_fwd_task_impl(), - get_replicate_fwd_signature()}; - case task_id_t::REPLICATE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_bwd_task_impl(), - get_replicate_bwd_signature()}; - default: - throw mk_runtime_error( - fmt::format("Invalid task ID")); // inserting task_id yields - // "type_is_unformattable" error - } -} - -std::vector get_task_ids(ComputationGraphOpAttrs const &op) { - return op.visit>(overload{ - [](BatchMatmulAttrs const &attrs) { return get_task_ids(attrs); }, - [](BatchNormAttrs const &attrs) { return get_task_ids(attrs); }, - [](CastAttrs const &attrs) { return get_task_ids(attrs); }, - [](ConcatAttrs const &attrs) { return get_task_ids(attrs); }, - [](Conv2DAttrs const &attrs) { return get_task_ids(attrs); }, - [](DropoutAttrs const &attrs) { return get_task_ids(attrs); }, - [](ElementBinaryAttrs const &attrs) { return get_task_ids(attrs); }, - [](ElementUnaryAttrs const &attrs) { return get_task_ids(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return get_task_ids(attrs); - // }, - [](FlatAttrs const &attrs) { return get_task_ids(attrs); }, - [](GatherAttrs const &attrs) { return get_task_ids(attrs); }, - [](InputAttrs const &attrs) { return get_task_ids(attrs); }, - [](LayerNormAttrs const &attrs) { return get_task_ids(attrs); }, - [](LinearAttrs const &attrs) { return 
get_task_ids(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return get_task_ids(attrs); }, - [](NoopAttrs const &attrs) { return get_task_ids(attrs); }, - [](Pool2DAttrs const &attrs) { return get_task_ids(attrs); }, - [](ReduceAttrs const &attrs) { return get_task_ids(attrs); }, - [](ReverseAttrs const &attrs) { return get_task_ids(attrs); }, - [](ReshapeAttrs const &attrs) { return get_task_ids(attrs); }, - [](SplitAttrs const &attrs) { return get_task_ids(attrs); }, - [](SoftmaxAttrs const &attrs) { return get_task_ids(attrs); }, - [](TopKAttrs const &attrs) { return get_task_ids(attrs); }, - [](TransposeAttrs const &attrs) { return get_task_ids(attrs); }, - [](WeightAttrs const &attrs) { return get_task_ids(attrs); }, - [](auto const &attrs) -> std::vector { - throw mk_runtime_error(fmt::format("Unhandled attr type: {}", attrs)); - }, - }); -} - -OpTaskInvocation init(ComputationGraphOpAttrs const &op) { - return op.visit(overload{ - [](BatchNormAttrs const &attrs) { return init(attrs); }, - [](Conv2DAttrs const &attrs) { return init(attrs); }, - [](DropoutAttrs const &attrs) { return init(attrs); }, - [](ElementBinaryAttrs const &attrs) { return init(attrs); }, - [](ElementUnaryAttrs const &attrs) { return init(attrs); }, - [](GatherAttrs const &attrs) { return init(attrs); }, - [](LayerNormAttrs const &attrs) { return init(attrs); }, - [](LinearAttrs const &attrs) { return init(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return init(attrs); }, - [](Pool2DAttrs const &attrs) { return init(attrs); }, - [](ReduceAttrs const &attrs) { return init(attrs); }, - [](ReshapeAttrs const &attrs) { return init(attrs); }, - [](SoftmaxAttrs const &attrs) { return init(attrs); }, - [](TopKAttrs const &attrs) { return init(attrs); }, - [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); - }, - }); -} - -OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { - return op.visit(overload{ - [](BatchMatmulAttrs const &attrs) { return forward(attrs); }, - [](BatchNormAttrs const &attrs) { return forward(attrs); }, - [](CastAttrs const &attrs) { return forward(attrs); }, - [](ConcatAttrs const &attrs) { return forward(attrs); }, - [](Conv2DAttrs const &attrs) { return forward(attrs); }, - [](DropoutAttrs const &attrs) { return forward(attrs); }, - [](ElementBinaryAttrs const &attrs) { return forward(attrs); }, - [](ElementUnaryAttrs const &attrs) { return forward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return forward(attrs); - // }, - [](FlatAttrs const &attrs) { return forward(attrs); }, - [](GatherAttrs const &attrs) { return forward(attrs); }, - [](LayerNormAttrs const &attrs) { return forward(attrs); }, - [](LinearAttrs const &attrs) { return forward(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return forward(attrs); }, - [](Pool2DAttrs const &attrs) { return forward(attrs); }, - [](ReduceAttrs const &attrs) { return forward(attrs); }, - [](ReverseAttrs const &attrs) { return forward(attrs); }, - [](ReshapeAttrs const &attrs) { return forward(attrs); }, - [](SplitAttrs const &attrs) { return forward(attrs); }, - [](SoftmaxAttrs const &attrs) { return forward(attrs); }, - [](TopKAttrs const &attrs) { return forward(attrs); }, - [](TransposeAttrs const &attrs) { return forward(attrs); }, - [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); - }, - }); -} - -OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { - 
return op.visit(overload{ - [](BatchMatmulAttrs const &attrs) { return backward(attrs); }, - [](BatchNormAttrs const &attrs) { return backward(attrs); }, - [](CastAttrs const &attrs) { return backward(attrs); }, - [](ConcatAttrs const &attrs) { return backward(attrs); }, - [](Conv2DAttrs const &attrs) { return backward(attrs); }, - [](DropoutAttrs const &attrs) { return backward(attrs); }, - [](ElementBinaryAttrs const &attrs) { return backward(attrs); }, - [](ElementUnaryAttrs const &attrs) { return backward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return backward(attrs); - // }, - [](FlatAttrs const &attrs) { return backward(attrs); }, - [](GatherAttrs const &attrs) { return backward(attrs); }, - [](LayerNormAttrs const &attrs) { return backward(attrs); }, - [](LinearAttrs const &attrs) { return backward(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return backward(attrs); }, - [](Pool2DAttrs const &attrs) { return backward(attrs); }, - [](ReduceAttrs const &attrs) { return backward(attrs); }, - [](ReverseAttrs const &attrs) { return backward(attrs); }, - [](ReshapeAttrs const &attrs) { return backward(attrs); }, - [](SplitAttrs const &attrs) { return backward(attrs); }, - [](SoftmaxAttrs const &attrs) { return backward(attrs); }, - [](TopKAttrs const &attrs) { return backward(attrs); }, - [](TransposeAttrs const &attrs) { return backward(attrs); }, - [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); - }, - }); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc index 363d1eedef..b8daa90e3b 100644 --- a/lib/local-execution/src/unallocated_tensors.cc +++ b/lib/local-execution/src/unallocated_tensors.cc @@ -70,7 +70,6 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( num_optimizer_tensors_to_allocate -= allocated_tensors.optimizer_mapping.at(tensor_guid).size(); } - std::cout << num_optimizer_tensors_to_allocate; for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { optimizer_tensor_t optimizer_tensor = diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index a973c6967b..0e79376575 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -11,11 +11,6 @@ ff_add_test_executable( local-execution kernels op-attrs + task-spec ) -set(FF_TEST_EXEC_NAME "local-execution-tests") -add_custom_command( - TARGET ${FF_TEST_EXEC_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake - DEPENDS ${FF_TEST_EXEC_NAME} -) diff --git a/lib/local-execution/test/modify_test_commands.cmake b/lib/local-execution/test/modify_test_commands.cmake deleted file mode 
100644 index 6494ae2d78..0000000000 --- a/lib/local-execution/test/modify_test_commands.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# modify_test_commands.cmake - -file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") - -foreach(ctest_tests_file IN LISTS ctest_tests_files) - file(READ "${ctest_tests_file}" content) - - # add nix run prefix - string(REGEX REPLACE - "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" - "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" - content "${content}") - - # add environment - # string(REGEX REPLACE - # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" - # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" - # content "${content}") - - file(WRITE "${ctest_tests_file}" "${content}") -endforeach() diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 45fc8e0a1c..3242ca79ad 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" @@ -29,16 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index ccad60a900..de759e2e01 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,6 +1,7 @@ -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "kernels/test_utils.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" @@ -14,20 +15,21 @@ using namespace ::FlexFlow; -bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { - for (int i = 0; i < batch_size; i++) { - if (first_epoch[i] < last_epoch[i]) { - return false; - } - } - return true; +bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, 
cpu_allocator)); } -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("E2ETest") { +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("LocalBackend e2e Training") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -35,36 +37,38 @@ TEST_SUITE(FF_TEST_SUITE) { LossTensorSource loss_tensor_source; loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int hidden_dim = 32_n; - nonnegative_int output_dim = 1_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = - allocator.allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( + output_tensor_shape, allocator); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( + weight_shape_1, allocator); + GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( + weight_shape_2, allocator); LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); + GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( + input_tensor_shape, allocator); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -73,6 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -81,13 +86,14 @@ TEST_SUITE(FF_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -98,7 +104,7 @@ TEST_SUITE(FF_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, @@ -123,6 +129,17 @@ 
TEST_SUITE(FF_TEST_SUITE) { GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + AllocatedTensors allocated_tensors = AllocatedTensors{ + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, + {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, + {TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, + }; + LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, @@ -141,28 +158,25 @@ TEST_SUITE(FF_TEST_SUITE) { loss_attrs, optimizer_attrs}; + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + int num_epochs = 5; - int num_samples = batch_size.unwrap_nonnegative(); - std::vector loss_values(num_epochs); + std::vector loss_values; for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - float *host_loss_ptr = new float[num_samples]; - model_training_instance.write_loss_tensor_to_host(host_loss_ptr); - loss_values[i] = host_loss_ptr; + loss_values.push_back(copy_tensor_accessor_r( + model_training_instance.get_loss_tensor_accessor(), cpu_allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - float *first_epoch = loss_values[0]; - float *last_epoch = loss_values[num_epochs - 1]; - CHECK(did_loss_decrease( - first_epoch, last_epoch, batch_size.unwrap_nonnegative())); + GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + + GenericTensorAccessorR last_epoch = loss_values.back(); - for (int i = 0; i < num_epochs; i++) { - delete[] loss_values[i]; - } + CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } -} \ No newline at end of file +} diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index c9c5afe04e..42b88aa6bc 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,10 +9,11 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { - // local backing initialization - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), @@ -23,8 +24,8 @@ TEST_SUITE(FF_TEST_SUITE) { LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config}; SUBCASE("Estimate cost -- Attention Op") { - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, /*num_heads=*/num_heads, @@ -36,14 +37,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + 
FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }); @@ -66,7 +67,7 @@ TEST_SUITE(FF_TEST_SUITE) { make_1d_machine_view( MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, MachineSpecificationDimension::INTRA_NODE, - stride_t{0_n})); + stride_t{1_p})); CHECK(result.total_elapsed_time > 0); CHECK(result.total_mem_usage > 0); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index f7e9da08ed..5c11010e2a 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/variant.h" using namespace ::FlexFlow; @@ -9,17 +9,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalTaskArgumentAccessor") { Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index 594051c2f1..bba0bd28ce 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -1,4 +1,4 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_tensor_backing.h" #include "test_utils.h" #include "utils/containers/keys.h" @@ -94,12 +94,10 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_source.new_mock_tensor_guid(); TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW allocated_tensor_backing = diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index ca2482653b..d741d4d8d4 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -13,11 +13,13 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); 
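The accessor-based loss check used across these rewritten tests composes two kernels/ helpers: compare_tensor_accessors_le performs an elementwise last <= first comparison into a tensor obtained from the given allocator, and tensor_accessor_all reduces that tensor to a single bool. A minimal self-contained sketch of the idiom, assuming the header locations and signatures implied by the includes in the tests above:

#include "kernels/accessor.h"
#include "kernels/allocation.h"
#include "kernels/compare_tensor_accessors.h"
#include "kernels/local_cpu_allocator.h"
#include "kernels/tensor_accessor_reductions.h"

using namespace ::FlexFlow;

// True iff no element of `last` exceeds the matching element of `first`,
// i.e. no sample's loss increased between the two epochs.
bool loss_did_not_increase(GenericTensorAccessorR const &first,
                           GenericTensorAccessorR const &last) {
  Allocator cpu_allocator = create_local_cpu_memory_allocator();
  // Elementwise `last <= first`, materialized via the CPU allocator.
  auto elementwise_le =
      compare_tensor_accessors_le(last, first, cpu_allocator);
  // Reduce the boolean-valued comparison tensor to a single bool.
  return tensor_accessor_all(elementwise_le);
}

Copying each epoch's loss to host first (copy_tensor_accessor_r with a CPU allocator, as the tests do) keeps the comparison itself off the GPU.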
@@ -28,16 +30,14 @@ TEST_SUITE(FF_TEST_SUITE) { loss_tensor_t label_for_sparse_cce_loss_attrs = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int output_dim = 32_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, - DataType::FLOAT}; + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = allocator.allocate_tensor(output_tensor_shape); @@ -55,12 +55,10 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index 16877b0e09..4bcfa7fe17 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,8 +1,8 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/task_signature_impl.h" #include "pcg/computation_graph_builder.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h" @@ -12,8 +12,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; ComputationGraphOpAttrs attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -80,7 +80,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("different attrs, still same task fn mapping") { layer_guid_t layer_1 = layer_guid_t{Node{1}}; - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; layer_guid_t layer_2 = layer_guid_t{Node{2}}; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("equality") { SUBCASE("different attrs is still equal") { - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 82f5a132fe..0a0b99e61c 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" 
#include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.h" @@ -38,16 +38,13 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source.new_optimizer_tensor(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 75ba517d1b..54c64e6b6c 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -11,11 +11,13 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); @@ -23,17 +25,15 @@ TEST_SUITE(FF_TEST_SUITE) { // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int output_dim = 32_p; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); From d5a57ba5f549a0dc420cae6d9e90f7128dd48476 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 12:05:48 -0700 Subject: [PATCH 86/91] feat: test e2e for realm-backend --- .proj.toml | 6 +- .../realm-backend/model_training_instance.h | 2 +- .../include/realm-backend/realm_allocator.h | 4 +- .../realm_task_argument_accessor.h | 2 +- .../realm_tensor_backing.struct.toml | 3 - .../src/model_training_instance.cc | 21 +++- lib/realm-backend/src/realm_allocator.cc | 4 + .../src/realm_task_argument_accessor.cc | 7 +- .../src/realm_training_backing.cc | 2 +- lib/realm-backend/test/src/test_e2e.cc | 105 ++++++++++-------- 10 files changed, 92 insertions(+), 64 deletions(-) diff --git a/.proj.toml b/.proj.toml index 56faaec75d..20a10c98da 100644 --- a/.proj.toml +++ b/.proj.toml @@ -72,8 +72,10 @@ has-cuda-benchmarks = false [targets.realm-backend] type = "lib" -tests = false -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false [targets.models] type = "lib" diff --git 
a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 049836d042..b1580b0305 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -28,7 +28,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - void write_loss_tensor_to_host(float *host_ptr); + GenericTensorAccessorR get_loss_tensor_accessor() const; }; } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_allocator.h b/lib/realm-backend/include/realm-backend/realm_allocator.h index 304ca38e32..2c6c854837 100644 --- a/lib/realm-backend/include/realm-backend/realm_allocator.h +++ b/lib/realm-backend/include/realm-backend/realm_allocator.h @@ -8,8 +8,6 @@ namespace FlexFlow { -struct RealmAllocatorImpl; - struct RealmAllocatorImpl : public IAllocator { RealmAllocatorImpl() = delete; RealmAllocatorImpl(RealmAllocatorImpl const &) = delete; @@ -20,6 +18,8 @@ struct RealmAllocatorImpl : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map ptrs; Realm::Processor proc; diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index 256e69c301..0e83a3de6f 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/task_argument_accessor.h" #include "realm-backend/realm_allocator.h" #include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_argument_accessor.h" #include #include diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml index 92a074e4fc..d53071dd0e 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml @@ -3,7 +3,6 @@ name = "RealmTensorBacking" features = [ "eq", "fmt", - "hash" ] includes = [ @@ -15,9 +14,7 @@ includes = [ ] src_includes = [ - "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", - "utils/hash/vector.h", "utils/fmt/vector.h", ] diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index 0c318f8942..87b8121bd5 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -1,6 +1,7 @@ #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "realm-backend/model_training_instance.h" +#include "kernels/format_accessor_contents.h" #include "utils/containers/reversed.h" namespace FlexFlow { @@ -39,6 +40,13 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->logit_tensor, this->label_tensor); + gradient_tensor_t loss_tensor = + this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.realm_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + PerLayerElapsedTime 
per_layer_elapsed_time; std::unordered_map> per_layer_elapsed_time_future; @@ -73,14 +81,19 @@ void ModelTrainingInstance::update() { this->optimizer_attrs); } -void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { +GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + GenericTensorAccessorW logit_tensor_backing = this->training_backing + .realm_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); + + gradient_tensor_t loss_tensor = - this->training_backing.realm_tensor_backing - .tensor_gradient_mapping.at(this->logit_tensor); + this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.realm_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - write_to_host_float_ptr(loss_tensor_backing, host_ptr); + + return read_only_accessor_from_write_accessor(loss_tensor_backing); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc index d7139210bc..d8c60f375f 100644 --- a/lib/realm-backend/src/realm_allocator.cc +++ b/lib/realm-backend/src/realm_allocator.cc @@ -36,6 +36,10 @@ void RealmAllocatorImpl::deallocate(void *ptr) { } } +DeviceType RealmAllocatorImpl::get_allocation_device_type() const { + return DeviceType::GPU; +} + Allocator create_realm_memory_allocator(Processor proc) { return Allocator::create(proc); } diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc index c7e81da01d..b7f10772e0 100644 --- a/lib/realm-backend/src/realm_task_argument_accessor.cc +++ b/lib/realm-backend/src/realm_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, TensorType tensor_type) const { SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 4e36bf8d5c..053bf62838 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -1,7 +1,6 @@ #include "kernels/allocation.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include 
"local-execution/task_signature_impl.h" #include "pcg/computation_graph.dtg.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" @@ -9,6 +8,7 @@ #include "task-spec/op_task_to_task_invocation.h" #include "task-spec/runtime_arg_config.h" #include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index ba180494c3..ea8ca883bd 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -1,5 +1,7 @@ -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "kernels/test_utils.h" #include "local-execution/allocated_tensors.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_training_backing.h" @@ -14,20 +16,21 @@ using namespace ::FlexFlow; using namespace Realm; -bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { - for (int i = 0; i < batch_size; i++) { - if (first_epoch[i] < last_epoch[i]) { - return false; - } - } - return true; +bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p) { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); std::vector worker_procs; std::vector allocators; Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) @@ -42,36 +45,37 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LossTensorSource loss_tensor_source; loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int hidden_dim = 32_n; - nonnegative_int output_dim = 1_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = - allocators[0].allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( + output_tensor_shape, allocators[0]); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; - TensorShape weight_shape_1 = 
TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( + weight_shape_1, allocators[0]); + GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( + weight_shape_2, allocators[0]); LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); + GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( + input_tensor_shape, allocators[0]); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -80,6 +84,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, std::nullopt}, {}, {}); + tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -88,13 +93,14 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, std::nullopt}, {}, {}); + tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -105,7 +111,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, @@ -130,6 +136,17 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + AllocatedTensors allocated_tensors = AllocatedTensors{ + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, + {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, + {TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, + }; + { printf("\nRunning test %d: E2ETest...\n", 1); RealmTrainingBacking realm_training_backing = RealmTrainingBacking( @@ -141,32 +158,26 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs }; + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + int num_epochs = 5; - int num_samples = batch_size.unwrap_nonnegative(); - std::vector loss_values(num_epochs); + std::vector loss_values; for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - float *host_loss_ptr = new float[num_samples]; - model_training_instance.write_loss_tensor_to_host(host_loss_ptr); - loss_values[i] = host_loss_ptr; + loss_values.push_back(copy_tensor_accessor_r( + model_training_instance.get_loss_tensor_accessor(), cpu_allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - float *first_epoch = loss_values[0]; - float *last_epoch = 
loss_values[num_epochs - 1]; - if(did_loss_decrease( - first_epoch, last_epoch, batch_size.unwrap_nonnegative())) { - printf("passed\n"); - } else { - printf("failed\n"); - } + GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + + GenericTensorAccessorR last_epoch = loss_values.back(); - for (int i = 0; i < num_epochs; i++) { - delete[] loss_values[i]; - } + assert(did_loss_decrease(first_epoch_loss, last_epoch)); + printf("passed\n"); } -} \ No newline at end of file +} From 32971ef0c652ab4c5d3236470003d496791b48a5 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 19:13:28 -0700 Subject: [PATCH 87/91] tweak: minor --- lib/local-execution/src/local_cost_estimator.cc | 2 -- lib/local-execution/test/src/test_e2e.cc | 4 ++-- lib/realm-backend/test/src/test_e2e.cc | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0a84c19066..85f315c7d1 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -95,10 +95,8 @@ CostDetails LocalCostEstimator::estimate_cost( float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed forward" << std::endl; float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index de759e2e01..f1c83e76a0 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -93,7 +93,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -104,7 +104,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index ea8ca883bd..fa0976991d 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -100,7 +100,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -111,7 +111,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, From a1a8c14583a709e8f48a575230cbc2e500557f8d Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 20:33:03 -0700 Subject: [PATCH 88/91] tweak: minor --- lib/realm-backend/src/realm_allocator.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc index d8c60f375f..287de0f2d5 100644 --- a/lib/realm-backend/src/realm_allocator.cc +++ b/lib/realm-backend/src/realm_allocator.cc @@ -21,6 +21,11 @@ void *RealmAllocatorImpl::allocate(size_t requested_memory_size) { 
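// Each allocation is backed by a Realm RegionInstance: create_instance builds
// a one-dimensional instance of `requested_memory_size` bytes in this
// processor's memory, the trailing .wait() blocks until the instance is
// ready, and the instance is recorded in `ptrs` so deallocate() can find and
// destroy it later.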
RegionInstance::create_instance(requested_instance, mem, bounds, field_sizes, /*SOA*/ 1, ProfilingRequestSet()) .wait(); + // TODO: it looks like there is no need to do this because the memory is already zeroed out + // char *zero_data = new char[requested_memory_size]; + // memset(zero_data, 0, requested_memory_size); + // requested_instance.write_untyped(0, (const void *)zero_data, requested_memory_size); + // delete[] zero_data; void *ptr = requested_instance.pointer_untyped(0, 0); this->ptrs.insert({ptr, requested_instance}); return ptr; From 4e3fb7d72282b4258f8faee4d024b525239ce57b Mon Sep 17 00:00:00 2001 From: fruitea Date: Sun, 3 Aug 2025 19:27:42 -0700 Subject: [PATCH 89/91] feat: reconstruct realm backend --- .flake/pkgs/ffdb/ffdb.py | 1 + .../src/export_model_arch.cc | 10 +- flake.lock | 18 +- flake.nix | 4 +- .../compiler/cost_estimator/cost_estimator.h | 4 +- .../cost_estimator/op_cost_estimate_key.h | 11 +- .../op_cost_estimate_key.struct.toml | 5 + .../compiler/cost_estimator/op_cost_metrics.h | 15 + .../op_cost_metrics.struct.toml | 11 +- .../runtime_only_cost_estimator.h | 52 ++ ..._only_cost_estimator_from_cost_estimator.h | 28 + .../runtime_only_op_cost_estimate_key.h | 18 + ...time_only_op_cost_estimate_key.struct.toml | 40 ++ .../runtime_only_op_cost_metrics.h | 14 + .../runtime_only_op_cost_metrics.struct.toml | 19 + ...easible_machine_mapping_result.struct.toml | 3 +- .../get_optimal_machine_mapping.h | 12 +- .../machine_mapping_context.struct.toml | 8 +- .../machine_mapping_problem_tree.h | 11 +- .../machine_mapping_problem_tree.variant.toml | 4 +- .../unmapped_op_cost_estimate_key.h | 13 +- .../unmapped_op_cost_estimate_key.struct.toml | 5 +- ...mapped_runtime_only_op_cost_estimate_key.h | 22 + ...time_only_op_cost_estimate_key.struct.toml | 34 ++ .../machine_mapping/machine_mapping_result.h | 5 +- .../get_optimal_machine_mapping_with_memory.h | 13 +- ...ne_mapping_with_memory_context.struct.toml | 23 + .../machine_mapping_with_memory_result.h | 2 +- ...hine_mapping_with_memory_state.struct.toml | 30 + .../pcg_task.variant.toml | 4 +- .../task_graph_simulator/task_simulator.h | 7 +- .../compiler/cost_estimator/cost_estimator.cc | 2 +- .../cost_estimator/op_cost_estimate_key.cc | 33 +- .../cost_estimator/op_cost_metrics.cc | 16 + .../runtime_only_cost_estimator.cc | 19 + ...only_cost_estimator_from_cost_estimator.cc | 45 ++ .../runtime_only_op_cost_estimate_key.cc | 17 + .../runtime_only_op_cost_metrics.cc | 14 + .../get_optimal_machine_mapping.cc | 28 +- .../get_machine_mapping_problem_tree.cc | 5 +- .../machine_mapping_problem_tree.cc | 19 +- .../unmapped_op_cost_estimate_key.cc | 39 +- ...apped_runtime_only_op_cost_estimate_key.cc | 39 ++ .../machine_mapping/machine_mapping_result.cc | 4 +- ...get_optimal_machine_mapping_with_memory.cc | 22 +- .../machine_mapping_with_memory_result.cc | 18 +- .../task_graph_simulator/pcg_task_graph.cc | 6 +- .../task_graph_execution_trace.cc | 6 +- .../task_graph_simulator/task_simulator.cc | 23 +- .../get_optimal_machine_mapping.cc | 126 +++-- .../get_tensor_set_movement_across_split.cc | 2 +- .../get_machine_mapping_problem_tree.cc | 54 +- .../machine_mapping/machine_mapping_result.cc | 16 +- ...get_optimal_machine_mapping_with_memory.cc | 102 ++-- .../machine_mapping_result_with_memory.cc | 71 +-- .../get_pcg_series_parallel_decomposition.cc | 2 +- .../task_graph_simulator/task_simulator.cc | 190 ++++--- .../cost_estimator_for_test.cc | 18 +- .../cost_estimator_for_test.h | 17 +- .../runtime_only_cost_estimator_for_test.cc | 52 ++
.../runtime_only_cost_estimator_for_test.h | 26 + lib/kernels/include/kernels/accessor.h | 107 ++-- lib/kernels/include/kernels/allocation.h | 4 +- lib/kernels/include/kernels/array_coord.h | 14 - lib/kernels/include/kernels/array_shape.h | 79 --- .../include/kernels/attention_kernels.h | 116 ++-- .../include/kernels/attention_kernels_cpu.h | 31 ++ .../include/kernels/attention_kernels_gpu.h | 52 ++ .../include/kernels/batch_matmul_kernels.h | 12 +- .../kernels/batch_matmul_kernels_cpu.h | 32 ++ .../kernels/batch_matmul_kernels_gpu.h | 38 ++ .../include/kernels/batch_norm_kernels.h | 39 +- .../include/kernels/batch_norm_kernels_cpu.h | 28 + .../include/kernels/batch_norm_kernels_gpu.h | 43 ++ lib/kernels/include/kernels/cast_kernels.h | 10 +- .../include/kernels/cast_kernels_cpu.h | 5 +- .../include/kernels/cast_kernels_gpu.h | 19 + lib/kernels/include/kernels/combine_kernels.h | 19 - .../include/kernels/combine_kernels_cpu.h | 17 - lib/kernels/include/kernels/concat_kernels.h | 6 +- .../include/kernels/concat_kernels_cpu.h | 19 + .../include/kernels/concat_kernels_gpu.h | 21 + lib/kernels/include/kernels/conv_2d_kernels.h | 99 ++-- .../include/kernels/conv_2d_kernels_cpu.h | 26 + .../include/kernels/conv_2d_kernels_gpu.h | 44 ++ .../conv_2d_per_device_state.struct.toml | 48 ++ .../include/kernels/copy_tensor_accessor.h | 2 +- .../kernels/create_accessor_with_contents.h | 17 +- .../create_local_allocator_for_device_type.h | 12 + lib/kernels/include/kernels/device_handle_t.h | 17 + .../kernels/device_handle_t.variant.toml | 16 + lib/kernels/include/kernels/device_stream_t.h | 15 + .../kernels/device_stream_t.variant.toml | 16 + lib/kernels/include/kernels/dropout_kernels.h | 83 +-- .../include/kernels/dropout_kernels_cpu.h | 12 + .../include/kernels/dropout_kernels_gpu.h | 33 ++ .../dropout_per_device_state.struct.toml | 40 ++ .../include/kernels/element_binary_kernels.h | 102 ++-- .../kernels/element_binary_kernels_cpu.h | 25 + .../kernels/element_binary_kernels_gpu.h | 43 ++ ...lement_binary_per_device_state.struct.toml | 32 ++ .../include/kernels/element_unary_kernels.h | 76 ++- .../kernels/element_unary_kernels_cpu.h | 22 + .../kernels/element_unary_kernels_gpu.h | 36 ++ ...element_unary_per_device_state.struct.toml | 19 + .../include/kernels/embedding_kernels.h | 16 +- .../include/kernels/embedding_kernels_cpu.h | 31 ++ .../include/kernels/embedding_kernels_gpu.h | 33 ++ lib/kernels/include/kernels/ff_handle.h | 23 +- .../include/kernels/fill_tensor_accessor.h | 2 +- lib/kernels/include/kernels/flat_kernels.h | 10 +- .../include/kernels/flat_kernels_cpu.h | 16 + .../include/kernels/flat_kernels_gpu.h | 20 + lib/kernels/include/kernels/gather_kernels.h | 39 +- .../include/kernels/gather_kernels_cpu.h | 18 + .../include/kernels/gather_kernels_gpu.h | 27 + .../gather_per_device_state.struct.toml | 16 + .../include/kernels/layer_norm_kernels.h | 93 ++-- .../include/kernels/layer_norm_kernels_cpu.h | 22 + .../include/kernels/layer_norm_kernels_gpu.h | 39 ++ .../layer_norm_per_device_state.struct.toml | 57 ++ lib/kernels/include/kernels/legion_dim.h | 4 + .../kernels/legion_ordered/legion_ordered.h | 8 +- lib/kernels/include/kernels/linear_kernels.h | 107 ++-- .../include/kernels/linear_kernels_cpu.h | 29 + .../include/kernels/linear_kernels_gpu.h | 49 ++ .../linear_per_device_state.struct.toml | 56 ++ .../include/kernels/loss_function_kernels.h | 25 +- .../kernels/loss_function_kernels_cpu.h | 41 ++ .../kernels/loss_function_kernels_gpu.h | 45 ++ .../kernels/managed_per_device_ff_handle.h | 
7 + .../include/kernels/map_tensor_accessors.h | 107 ++-- .../kernels/mha_per_device_state.struct.toml | 65 +++ .../include/kernels/optimizer_kernels.h | 79 +-- .../include/kernels/optimizer_kernels_cpu.h | 31 ++ .../include/kernels/optimizer_kernels_gpu.h | 59 ++ .../include/kernels/partition_kernels.h | 34 -- .../partition_per_device_state.struct.toml | 16 + .../kernels/per_device_op_state.variant.toml | 82 --- lib/kernels/include/kernels/pool_2d_kernels.h | 108 ++-- .../include/kernels/pool_2d_kernels_cpu.h | 15 + .../include/kernels/pool_2d_kernels_gpu.h | 46 ++ .../pool_2d_per_device_state.struct.toml | 32 ++ lib/kernels/include/kernels/profiling.h | 74 ++- lib/kernels/include/kernels/reduce_kernels.h | 57 +- .../include/kernels/reduce_kernels_cpu.h | 12 + .../include/kernels/reduce_kernels_gpu.h | 30 + .../reduce_per_device_state.struct.toml | 33 ++ .../include/kernels/reduce_tensor_accessor.h | 29 +- .../include/kernels/reduction_kernels.h | 20 - .../include/kernels/replicate_kernels.h | 20 - .../include/kernels/replicate_kernels_cpu.h | 18 - lib/kernels/include/kernels/reshape_kernels.h | 24 +- .../include/kernels/reshape_kernels_cpu.h | 16 + .../include/kernels/reshape_kernels_gpu.h | 19 + lib/kernels/include/kernels/reverse_kernels.h | 9 +- .../include/kernels/reverse_kernels_cpu.h | 1 - .../include/kernels/reverse_kernels_gpu.h | 22 + .../include/kernels/reverse_kernels_params.h | 4 +- lib/kernels/include/kernels/softmax_kernels.h | 58 +- .../include/kernels/softmax_kernels_cpu.h | 16 + .../include/kernels/softmax_kernels_gpu.h | 32 ++ .../softmax_per_device_state.struct.toml | 21 + lib/kernels/include/kernels/split_kernels.h | 20 +- .../include/kernels/split_kernels_cpu.h | 22 + .../include/kernels/split_kernels_gpu.h | 26 + .../kernels/tensor_accessor_binary_ops.h | 48 ++ .../kernels/tensor_accessor_unary_ops.h | 50 ++ lib/kernels/include/kernels/topk_kernels.h | 25 +- .../include/kernels/topk_kernels_cpu.h | 25 + .../include/kernels/topk_kernels_gpu.h | 27 + .../include/kernels/transpose_kernels.h | 14 +- .../include/kernels/transpose_kernels_cpu.h | 19 + .../include/kernels/transpose_kernels_gpu.h | 22 + lib/kernels/src/cpu/ops/combine_kernels.cc | 40 -- .../src/cpu/ops/initializer_kernels.cc | 9 +- lib/kernels/src/cpu/ops/replicate_kernels.cc | 53 -- lib/kernels/src/cuda/cuda_helper.cu | 52 +- lib/kernels/src/cuda/embedding_kernels.cu | 128 +++-- lib/kernels/src/cuda/loss_function_kernels.cu | 44 +- lib/kernels/src/cuda/ops/attention_kernels.cu | 143 ++--- .../src/cuda/ops/batch_matmul_kernels.cu | 50 +- .../src/cuda/ops/batch_norm_kernels.cu | 69 ++- lib/kernels/src/cuda/ops/cast_kernels.cu | 22 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 68 --- lib/kernels/src/cuda/ops/concat_kernels.cu | 40 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 119 ++-- lib/kernels/src/cuda/ops/dropout_kernels.cu | 64 +-- .../src/cuda/ops/element_binary_kernels.cu | 80 +-- .../src/cuda/ops/element_unary_kernels.cu | 73 +-- lib/kernels/src/cuda/ops/flat_kernels.cu | 33 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 87 +-- .../src/cuda/ops/layer_norm_kernels.cu | 72 +-- lib/kernels/src/cuda/ops/linear_kernels.cu | 152 ++--- lib/kernels/src/cuda/ops/partition_kernels.cu | 79 --- lib/kernels/src/cuda/ops/pool_2d_kernels.cu | 70 ++- lib/kernels/src/cuda/ops/reduce_kernels.cu | 42 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 86 --- lib/kernels/src/cuda/ops/replicate_kernels.cu | 84 --- lib/kernels/src/cuda/ops/reshape_kernels.cu | 61 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 22 +- 
lib/kernels/src/cuda/ops/softmax_kernels.cu | 40 +- lib/kernels/src/cuda/ops/split_kernels.cu | 30 +- lib/kernels/src/cuda/ops/topk_kernels.cu | 39 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 54 +- lib/kernels/src/cuda/optimizer_kernels.cu | 14 +- lib/kernels/src/ff_handle.cc | 1 + lib/kernels/src/internal/device.h | 8 +- lib/kernels/src/kernels/accessor.cc | 109 ++-- lib/kernels/src/kernels/allocation.cc | 7 +- lib/kernels/src/kernels/array_shape.cc | 150 ----- lib/kernels/src/kernels/attention_kernels.cc | 125 +++++ .../src/kernels/attention_kernels_cpu.cc | 25 + .../src/kernels/batch_matmul_kernels.cc | 93 ++++ .../src/kernels/batch_matmul_kernels_cpu.cc | 31 ++ lib/kernels/src/kernels/batch_norm_kernels.cc | 107 ++++ .../src/kernels/batch_norm_kernels_cpu.cc | 25 + lib/kernels/src/kernels/cast_kernels.cc | 39 ++ .../cast_kernels_cpu.cc} | 8 +- lib/kernels/src/kernels/concat_kernels.cc | 45 ++ lib/kernels/src/kernels/concat_kernels_cpu.cc | 17 + lib/kernels/src/kernels/conv_2d_kernels.cc | 118 ++++ .../src/kernels/conv_2d_kernels_cpu.cc | 25 + .../src/kernels/copy_tensor_accessor.cc | 20 +- .../create_local_allocator_for_device_type.cc | 16 + lib/kernels/src/kernels/device_handle_t.cc | 24 + lib/kernels/src/kernels/device_stream_t.cc | 25 + lib/kernels/src/kernels/dropout_kernels.cc | 79 +++ .../src/kernels/dropout_kernels_cpu.cc | 14 + .../src/kernels/element_binary_kernels.cc | 116 ++++ .../src/kernels/element_binary_kernels_cpu.cc | 25 + .../src/kernels/element_unary_kernels.cc | 92 ++++ .../src/kernels/element_unary_kernels_cpu.cc | 19 + lib/kernels/src/kernels/embedding_kernels.cc | 81 +++ .../src/kernels/embedding_kernels_cpu.cc | 29 + .../src/kernels/fill_tensor_accessor.cc | 28 +- lib/kernels/src/kernels/flat_kernels.cc | 42 ++ lib/kernels/src/kernels/flat_kernels_cpu.cc | 16 + .../src/kernels/format_accessor_contents.cc | 145 ++++- lib/kernels/src/kernels/gather_kernels.cc | 66 +++ lib/kernels/src/kernels/gather_kernels_cpu.cc | 17 + lib/kernels/src/kernels/layer_norm_kernels.cc | 99 ++++ .../src/kernels/layer_norm_kernels_cpu.cc | 21 + lib/kernels/src/kernels/legion_dim.cc | 14 + lib/kernels/src/kernels/linear_kernels.cc | 148 +++++ lib/kernels/src/kernels/linear_kernels_cpu.cc | 96 ++++ .../src/kernels/loss_function_kernels.cc | 126 +++++ .../src/kernels/loss_function_kernels_cpu.cc | 51 ++ lib/kernels/src/kernels/optimizer_kernels.cc | 98 ++++ .../src/kernels/optimizer_kernels_cpu.cc | 76 +++ lib/kernels/src/kernels/pool_2d_kernels.cc | 105 ++++ .../src/kernels/pool_2d_kernels_cpu.cc | 17 + lib/kernels/src/kernels/reduce_kernels.cc | 62 +++ lib/kernels/src/kernels/reduce_kernels_cpu.cc | 14 + lib/kernels/src/kernels/reshape_kernels.cc | 39 ++ .../src/kernels/reshape_kernels_cpu.cc | 15 + lib/kernels/src/kernels/reverse_kernels.cc | 33 ++ .../reverse_kernels_cpu.cc} | 23 +- .../src/kernels/reverse_kernels_params.cc | 20 +- lib/kernels/src/kernels/softmax_kernels.cc | 79 +++ .../src/kernels/softmax_kernels_cpu.cc | 16 + lib/kernels/src/kernels/split_kernels.cc | 63 +++ lib/kernels/src/kernels/split_kernels_cpu.cc | 24 + .../src/kernels/tensor_accessor_binary_ops.cc | 143 +++++ .../src/kernels/tensor_accessor_reductions.cc | 4 +- .../src/kernels/tensor_accessor_unary_ops.cc | 247 +++++++++ lib/kernels/src/kernels/topk_kernels.cc | 67 +++ lib/kernels/src/kernels/topk_kernels_cpu.cc | 25 + lib/kernels/src/kernels/transpose_kernels.cc | 45 ++ .../src/kernels/transpose_kernels_cpu.cc | 17 + .../src/managed_per_device_ff_handle.cc | 13 + 
.../test/src/cpu/ops/replicate_kernels.cc | 59 -- .../{src => test/src/internal}/test_utils.cc | 39 +- .../src/internal}/test_utils.h | 2 - lib/kernels/test/src/kernels/accessor.cc | 57 +- lib/kernels/test/src/kernels/array_shape.cc | 87 --- .../src/kernels/compare_tensor_accessors.cc | 2 +- .../kernels/create_accessor_with_contents.cc | 9 +- .../src/kernels/format_accessor_contents.cc | 64 ++- lib/kernels/test/src/kernels/legion_dim.cc | 2 +- .../src/kernels/legion_ordered/transform.cc | 2 +- .../test/src/kernels/linear_kernels.cc | 263 +++++++++ .../test/src/kernels/linear_kernels_cpu.cc | 175 ++++++ .../test/src/kernels/map_tensor_accessors.cc | 12 +- .../src/kernels/reduce_tensor_accessor.cc | 2 +- .../reverse_kernels_cpu.cc} | 4 +- .../src/kernels/tensor_accessor_unary_ops.cc | 178 ++++++ lib/kernels/test/src/test_attention_kernel.cc | 16 +- .../test/src/test_batch_matmul_kernel.cc | 56 +- .../test/src/test_batch_norm_kernel.cc | 24 +- lib/kernels/test/src/test_cast_kernel.cc | 17 +- lib/kernels/test/src/test_combine_kernel.cc | 106 ---- lib/kernels/test/src/test_concat_kernel.cc | 24 +- lib/kernels/test/src/test_cuda.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 38 +- lib/kernels/test/src/test_flat_kernel.cc | 22 +- lib/kernels/test/src/test_gather_kernels.cc | 45 +- .../test/src/test_layer_norm_kernels.cc | 36 +- .../test/src/test_managed_ff_stream.cc | 107 ---- lib/kernels/test/src/test_partition_kernel.cc | 52 -- lib/kernels/test/src/test_pool_2d_kernels.cc | 30 +- lib/kernels/test/src/test_reduction_kernel.cc | 57 -- lib/kernels/test/src/test_replicate_kernel.cc | 153 ------ lib/kernels/test/src/test_reshape_kernel.cc | 27 +- lib/kernels/test/src/test_reverse_kernels.cc | 38 +- lib/kernels/test/src/test_softmax_kernel.cc | 33 +- lib/kernels/test/src/test_split_kernel.cc | 43 +- lib/kernels/test/src/test_transpose_kernel.cc | 18 +- lib/local-execution/CMakeLists.txt | 1 + .../local-execution/allocated_tensors.h | 32 -- .../allocated_tensors.struct.toml | 30 - .../local-execution/cost_details.struct.toml | 18 - .../include/local-execution/cost_estimate.h | 63 --- .../include/local-execution/cost_metrics.h | 70 --- .../local-execution/gradient_tensor_source.h | 22 - .../local-execution/local_args_backing.h | 38 +- .../local_args_backing.struct.toml | 18 + .../local-execution/local_cost_estimator.h | 16 +- .../local_task_argument_accessor.h | 22 +- .../local-execution/local_task_registry.h | 24 + .../local_task_registry.struct.toml | 26 + .../local-execution/local_tensor_backing.h | 50 +- .../local_tensor_backing.struct.toml | 18 +- .../local-execution/local_training_backing.h | 90 ++- .../local_training_backing.struct.toml | 26 + .../local-execution/model_training_instance.h | 13 +- .../local-execution/operator_task_set.h | 24 + .../operator_task_set.struct.toml | 24 + .../local-execution/optimizer_tensor_source.h | 22 - .../include/local-execution/registered_task.h | 12 + .../registered_task_t.variant.toml | 27 + .../include/local-execution/task_registry.h | 21 - .../local-execution/task_registry.struct.toml | 35 -- .../tensor_slot_backing.variant.toml | 23 + .../local-execution/tracked_allocator.h | 3 +- .../local-execution/unallocated_tensors.h | 27 - .../unallocated_tensors.struct.toml | 31 -- lib/local-execution/src/allocated_tensors.cc | 145 ----- .../src/local-execution/local_args_backing.cc | 62 +++ .../local-execution/local_cost_estimator.cc | 165 ++++++ .../local-execution/local_task_registry.cc | 64 +++ .../local-execution/local_tensor_backing.cc | 74 +++ 
.../local-execution/local_training_backing.cc | 221 ++++++++ .../model_training_instance.cc | 85 +++ .../src/local-execution/operator_task_set.cc | 71 +++ .../src/local-execution/registered_task.cc | 9 + lib/local-execution/src/local_args_backing.cc | 46 -- .../src/local_cost_estimator.cc | 122 ---- .../src/local_task_argument_accessor.cc | 23 +- .../src/local_tensor_backing.cc | 95 ---- .../src/local_training_backing.cc | 264 --------- lib/local-execution/src/loss_tensor_source.cc | 13 - .../src/model_training_instance.cc | 80 --- lib/local-execution/src/task_binding.cc | 58 +- lib/local-execution/src/task_registry.cc | 78 --- lib/local-execution/src/tracked_allocator.cc | 4 +- .../src/unallocated_tensors.cc | 92 ---- .../test/src/{ => internal}/test_utils.cc | 2 +- .../test/src/{ => internal}/test_utils.h | 0 .../local-execution/local_cost_estimator.cc | 142 +++++ .../local_task_argument_accessor.cc} | 47 +- .../local-execution/local_task_registry.cc | 278 ++++++++++ .../local-execution/local_tensor_backing.cc | 285 ++++++++++ .../local_training_backing.cc} | 77 +-- .../loss_functions.cc} | 127 +++-- .../test/src/test_allocated_tensors.cc | 226 -------- lib/local-execution/test/src/test_e2e.cc | 241 ++++++-- .../test/src/test_local_cost_estimator.cc | 76 --- .../test/src/test_local_tensor_backing.cc | 146 ----- .../test/src/test_task_registry.cc | 216 -------- .../test/src/test_unallocated_tensors.cc | 440 --------------- lib/models/src/models/bert/bert.cc | 29 +- .../src/models/candle_uno/candle_uno.cc | 3 +- lib/models/src/models/dlrm/dlrm.cc | 5 +- .../src/models/transformer/transformer.cc | 4 +- lib/op-attrs/include/op-attrs/datatype.h | 2 +- .../include/op-attrs/datatype_value.h | 4 + .../op-attrs/datatype_value.variant.toml | 13 + lib/op-attrs/include/op-attrs/ff_dim_t.h | 4 + .../include/op-attrs/ff_ordered/ff_ordered.h | 13 +- .../include/op-attrs/ff_ordered/filtrans.h | 20 + .../include/op-attrs/ff_ordered/reversed.h | 16 + .../include/op-attrs/ff_ordered/slice.h | 5 +- .../include/op-attrs/ff_ordered/transform.h | 3 +- .../include/op-attrs/ff_ordered/zip.h | 3 +- .../include/op-attrs/ff_ordered/zip_with.cc | 14 + .../include/op-attrs/ff_ordered/zip_with.h | 22 + .../op-attrs/ops/gather_attrs.struct.toml | 5 +- .../op-attrs/ops/layer_norm_attrs.struct.toml | 11 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 36 +- .../include/op-attrs/tensor_dims_coord.h | 17 + .../op-attrs/tensor_dims_coord.struct.toml} | 7 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 11 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 38 ++ lib/op-attrs/src/op-attrs/ff_dim_t.cc | 7 + .../src/op-attrs/ff_ordered/filtrans.cc | 12 + .../src/op-attrs/ff_ordered/reversed.cc | 10 + lib/op-attrs/src/op-attrs/ops/attention.cc | 7 +- .../attention/multihead_attention_inputs.cc | 31 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 21 +- lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/broadcast.cc | 4 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 6 +- .../ops/conv_2d/conv_2d_input_shape.cc | 21 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 5 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 8 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 14 +- lib/op-attrs/src/op-attrs/ops/softmax.cc | 3 +- .../src/op-attrs/parallel_tensor_dims.cc | 10 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 138 ++++- .../src/op-attrs/tensor_dims_coord.cc} | 14 +- 
lib/op-attrs/src/op-attrs/tensor_shape.cc | 26 +- .../test/src/op-attrs/datatype_value.cc | 75 +++ .../test/src/op-attrs/ff_ordered/concat.cc | 18 +- .../test/src/op-attrs/ff_ordered/enumerate.cc | 2 +- .../ff_ordered/ff_ordered_from_map.cc | 2 +- .../test/src/op-attrs/ff_ordered/reversed.cc | 26 + .../test/src/op-attrs/ff_ordered/transform.cc | 2 +- .../test/src/op-attrs/ff_ordered/zip.cc | 31 +- .../test/src/op-attrs/ff_ordered/zip_with.cc | 80 +++ .../test/src/op-attrs/ops/element_binary.cc | 3 +- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 16 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 129 +++++ .../test/src/op-attrs/tensor_dims_coord.cc} | 21 +- .../cg_operator_plus_signature.struct.toml | 23 + .../pcg/cg_operator_tensor_shape_signature.h | 20 + ...perator_tensor_shape_signature.struct.toml | 32 ++ .../include/pcg/computation_graph_builder.h | 2 +- .../pcg/file_format/v1/data_type_value.h | 2 +- lib/pcg/include/pcg/optimizer_attrs.h | 3 +- .../pcg_operator_plus_signature.struct.toml | 23 + ...perator_tensor_shape_signature.struct.toml | 31 ++ .../include/pcg}/tensor_role.enum.toml | 0 .../pcg/cg_operator_tensor_shape_signature.cc | 28 + lib/pcg/src/pcg/computation_graph_builder.cc | 46 +- lib/pcg/src/pcg/optimizer_attrs.cc | 10 +- .../parallel_computation_graph.cc | 9 +- lib/realm-backend/CMakeLists.txt | 1 + .../realm-backend/model_training_instance.h | 19 +- .../realm-backend/realm_args_backing.h | 38 -- .../realm_task_argument_accessor.h | 47 -- .../realm-backend/realm_tensor_backing.h | 47 -- .../realm_tensor_backing.struct.toml | 31 -- .../realm-backend/realm_training_backing.h | 90 ++- .../include/realm-backend/task_result.h | 1 - .../include/realm-backend/task_wrapper.h | 3 +- .../src/model_training_instance.cc | 132 +++-- lib/realm-backend/src/realm_args_backing.cc | 46 -- .../src/realm_task_argument_accessor.cc | 65 --- lib/realm-backend/src/realm_tensor_backing.cc | 94 ---- .../src/realm_training_backing copy.cc | 126 +++++ .../src/realm_training_backing.cc | 520 ++++++++---------- lib/realm-backend/src/task_wrapper.cc | 9 +- lib/realm-backend/test/src/test_e2e.cc | 328 +++++------ lib/realm-backend/test/src/test_update.cc | 14 +- lib/runtime/src/ops/embedding.cc | 120 ---- .../{concrete_arg.h => concrete_arg_spec.h} | 6 +- ...device_specific_device_states.variant.toml | 63 +-- ...toml => forward_tensor_guid_t.struct.toml} | 2 +- .../include/task-spec/forward_tensor_source.h | 22 + ...oml => gradient_tensor_guid_t.struct.toml} | 2 +- .../task-spec/gradient_tensor_source.h | 22 + .../task-spec/init_op_task_impl_function.h | 7 +- .../task-spec/itask_argument_accessor.h | 2 +- .../include/task-spec}/loss_functions.h | 15 +- ...ct.toml => loss_tensor_guid_t.struct.toml} | 2 +- .../include/task-spec}/loss_tensor_source.h | 8 +- lib/task-spec/include/task-spec/op_arg_ref.h | 8 +- .../task-spec/op_arg_spec.variant.toml | 2 +- .../include/task-spec/op_task_binding.h | 97 ++++ .../include/task-spec/op_task_invocation.h | 109 +--- .../task-spec/op_task_invocation.struct.toml | 16 + .../task-spec/op_task_to_task_invocation.h | 33 +- .../task-spec/op_tensor_slot_spec.struct.toml | 2 +- .../include/task-spec/op_tensor_spec.h | 16 +- .../task-spec/op_tensor_spec.struct.toml | 28 + lib/task-spec/include/task-spec/ops/combine.h | 23 - .../include/task-spec/ops/reduction.h | 24 - .../include/task-spec/ops/repartition.h | 26 - .../include/task-spec/ops/replicate.h | 23 - lib/task-spec/include/task-spec/ops/reshape.h | 3 - lib/task-spec/include/task-spec/ops/topk.h | 7 +- 
.../include/task-spec}/optimizer.h | 24 +- ...ml => optimizer_tensor_guid_t.struct.toml} | 2 +- .../task-spec/optimizer_tensor_source.h | 22 + ...parallel_tensor_shape_ref_type.struct.toml | 12 +- .../include/task-spec/per_device_op_state.h | 6 +- .../per_device_op_state.variant.toml | 64 +-- lib/task-spec/include/task-spec/profiling.h | 11 +- .../include/task-spec/runtime_arg_config.h | 21 +- .../task-spec/runtime_arg_config.struct.toml | 25 + .../include/task-spec/runtime_arg_ref.h | 18 +- .../task-spec/runtime_arg_ref_type.enum.toml | 17 + .../task-spec/task_arg_spec.variant.toml | 2 +- .../task-spec/task_argument_accessor.h | 14 +- .../include/task-spec/task_binding.h | 39 +- .../include/task-spec/task_id_t.enum.toml | 9 - .../include/task-spec/task_signature_impl.h | 10 +- ....toml => tensor_sub_slot_id_t.struct.toml} | 2 +- .../task-spec/tensor_type_t.variant.toml | 31 -- .../task-spec/training_computation_graph.h | 68 +++ .../training_computation_graph.struct.toml | 27 + .../task-spec/training_layer_plus_context.h | 50 ++ .../training_layer_plus_context.struct.toml | 29 + .../training_layer_tensor_group_signature.h | 20 + ...g_layer_tensor_group_signature.struct.toml | 19 + .../include/task-spec/training_tensor_group.h | 28 + .../training_tensor_group.struct.toml | 31 ++ .../training_tensor_group_with_attrs.h | 18 + ...aining_tensor_group_with_attrs.struct.toml | 37 ++ .../training_tensor_guid_t.variant.toml | 31 ++ .../{concrete_arg.cc => concrete_arg_spec.cc} | 2 +- .../src/task-spec/forward_tensor_source.cc | 18 + .../src/task-spec}/gradient_tensor_source.cc | 8 +- .../src/task-spec}/loss_functions.cc | 78 +-- .../src/task-spec/loss_tensor_source.cc | 13 + lib/task-spec/src/task-spec/op_arg_ref.cc | 27 +- .../task-spec/op_task_to_task_invocation.cc | 214 ++++--- lib/task-spec/src/task-spec/op_tensor_spec.cc | 12 +- lib/task-spec/src/task-spec/ops/attention.cc | 111 ++-- .../src/task-spec/ops/batch_matmul.cc | 88 +-- lib/task-spec/src/task-spec/ops/batch_norm.cc | 97 ++-- lib/task-spec/src/task-spec/ops/cast.cc | 26 +- lib/task-spec/src/task-spec/ops/combine.cc | 94 ---- lib/task-spec/src/task-spec/ops/concat.cc | 31 +- lib/task-spec/src/task-spec/ops/conv_2d.cc | 85 ++- lib/task-spec/src/task-spec/ops/dropout.cc | 72 ++- .../src/task-spec/ops/element_binary.cc | 71 ++- .../src/task-spec/ops/element_unary.cc | 63 ++- lib/task-spec/src/task-spec/ops/embedding.cc | 120 ++++ lib/task-spec/src/task-spec/ops/flat.cc | 26 +- lib/task-spec/src/task-spec/ops/gather.cc | 81 ++- lib/task-spec/src/task-spec/ops/layer_norm.cc | 82 ++- lib/task-spec/src/task-spec/ops/linear.cc | 158 +++--- lib/task-spec/src/task-spec/ops/pool_2d.cc | 83 ++- lib/task-spec/src/task-spec/ops/reduce.cc | 62 ++- lib/task-spec/src/task-spec/ops/reduction.cc | 101 ---- .../src/task-spec/ops/repartition.cc | 137 ----- lib/task-spec/src/task-spec/ops/replicate.cc | 99 ---- lib/task-spec/src/task-spec/ops/reshape.cc | 70 +-- lib/task-spec/src/task-spec/ops/reverse.cc | 25 +- lib/task-spec/src/task-spec/ops/softmax.cc | 87 ++- lib/task-spec/src/task-spec/ops/split.cc | 48 +- lib/task-spec/src/task-spec/ops/topk.cc | 79 +-- lib/task-spec/src/task-spec/ops/transpose.cc | 28 +- .../src/task-spec}/optimizer.cc | 85 +-- .../src/task-spec}/optimizer_tensor_source.cc | 8 +- lib/task-spec/src/task-spec/profiling.cc | 1 + .../src/task-spec/runtime_arg_config.cc | 30 + .../src/task-spec/runtime_arg_ref.cc | 20 +- .../src/task-spec/task_invocation.cc | 6 +- .../src/task-spec/task_signature_impl.cc | 83 +-- 
.../task-spec/training_computation_graph.cc | 183 ++++++ .../task-spec/training_layer_plus_context.cc | 122 ++++ .../training_layer_tensor_group_signature.cc | 31 ++ .../src/task-spec/training_tensor_group.cc | 48 ++ .../training_tensor_group_with_attrs.cc | 26 + .../src/task-spec/training_tensor_group.cc | 36 ++ .../training_tensor_group_with_attrs.cc | 84 +++ .../utils/archetypes/ordered_value_type.h | 10 + .../include/utils/containers/all_are_true.h | 17 + .../utils/containers/collapse_optionals.h | 19 + .../include/utils/containers/contains_value.h | 33 ++ .../include/utils/containers/filter_keys.h | 12 + lib/utils/include/utils/containers/filtrans.h | 6 +- lib/utils/include/utils/containers/flatmap.h | 11 + lib/utils/include/utils/exception.h | 3 +- lib/utils/include/utils/fmt/half.h | 26 + lib/utils/include/utils/fmt/set.h | 3 +- lib/utils/include/utils/{fp16.h => half.h} | 0 lib/utils/include/utils/json/half.h | 17 + .../utils/nonnegative_int/nonnegative_int.h | 1 + .../utils/nonnegative_int/nonnegative_range.h | 2 + lib/utils/include/utils/rapidcheck/half.h | 16 + .../include/utils/rapidcheck/monostate.h | 16 + .../include/utils/units/milliseconds_t.h | 67 +++ lib/utils/include/utils/units/num_bytes_t.h | 62 +++ lib/utils/src/{fp16.cc => half.cc} | 2 +- .../src/utils/containers/all_are_true.cc | 10 + .../utils/containers/collapse_optionals.cc | 11 + .../src/utils/containers/contains_value.cc | 13 + lib/utils/src/utils/containers/filtrans.cc | 11 + lib/utils/src/utils/fmt/half.cc | 9 + lib/utils/src/utils/fmt/set.cc | 15 + lib/utils/src/utils/json/half.cc | 13 + .../utils/nonnegative_int/nonnegative_int.cc | 33 +- .../nonnegative_int/nonnegative_range.cc | 4 + lib/utils/src/utils/rapidcheck/half.cc | 9 + lib/utils/src/utils/rapidcheck/monostate.cc | 9 + lib/utils/src/utils/units/milliseconds_t.cc | 94 ++++ lib/utils/src/utils/units/num_bytes_t.cc | 87 +++ .../include/test/utils/doctest/fmt/half.h | 16 + lib/utils/test/common/src/main.cc | 2 + .../common/src/test/utils/doctest/fmt/half.cc | 9 + .../test/src/utils/containers/all_are_true.cc | 36 ++ .../utils/containers/collapse_optionals.cc | 38 ++ .../src/utils/containers/contains_value.cc | 51 ++ .../src/utils/positive_int/positive_int.cc | 28 +- 601 files changed, 16269 insertions(+), 9723 deletions(-) create mode 100644 lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml 
create mode 100644 lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc create mode 100644 lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc rename lib/compiler/test/src/{compiler => internal}/cost_estimator_for_test.cc (73%) rename lib/compiler/test/src/{compiler => internal}/cost_estimator_for_test.h (69%) create mode 100644 lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc create mode 100644 lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h delete mode 100644 lib/kernels/include/kernels/array_coord.h delete mode 100644 lib/kernels/include/kernels/array_shape.h create mode 100644 lib/kernels/include/kernels/attention_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/attention_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/batch_matmul_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/batch_matmul_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/batch_norm_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/batch_norm_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/cast_kernels_gpu.h delete mode 100644 lib/kernels/include/kernels/combine_kernels.h delete mode 100644 lib/kernels/include/kernels/combine_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/concat_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/concat_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/conv_2d_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/conv_2d_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/create_local_allocator_for_device_type.h create mode 100644 lib/kernels/include/kernels/device_handle_t.h create mode 100644 lib/kernels/include/kernels/device_handle_t.variant.toml create mode 100644 lib/kernels/include/kernels/device_stream_t.h create mode 100644 lib/kernels/include/kernels/device_stream_t.variant.toml create mode 100644 lib/kernels/include/kernels/dropout_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/dropout_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/dropout_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/element_binary_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/element_binary_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/element_binary_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/element_unary_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/element_unary_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/element_unary_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/embedding_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/embedding_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/flat_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/flat_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/gather_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/gather_kernels_gpu.h 
create mode 100644 lib/kernels/include/kernels/gather_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/layer_norm_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/layer_norm_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/linear_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/linear_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/linear_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/loss_function_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/loss_function_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/mha_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/optimizer_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/optimizer_kernels_gpu.h delete mode 100644 lib/kernels/include/kernels/partition_kernels.h create mode 100644 lib/kernels/include/kernels/partition_per_device_state.struct.toml delete mode 100644 lib/kernels/include/kernels/per_device_op_state.variant.toml create mode 100644 lib/kernels/include/kernels/pool_2d_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/pool_2d_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/reduce_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reduce_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/reduce_per_device_state.struct.toml delete mode 100644 lib/kernels/include/kernels/reduction_kernels.h delete mode 100644 lib/kernels/include/kernels/replicate_kernels.h delete mode 100644 lib/kernels/include/kernels/replicate_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reshape_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reshape_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/softmax_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/softmax_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/softmax_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/split_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/split_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/tensor_accessor_binary_ops.h create mode 100644 lib/kernels/include/kernels/tensor_accessor_unary_ops.h create mode 100644 lib/kernels/include/kernels/topk_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/topk_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/transpose_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/transpose_kernels_gpu.h delete mode 100644 lib/kernels/src/cpu/ops/combine_kernels.cc delete mode 100644 lib/kernels/src/cpu/ops/replicate_kernels.cc delete mode 100644 lib/kernels/src/cuda/ops/combine_kernels.cu delete mode 100644 lib/kernels/src/cuda/ops/partition_kernels.cu delete mode 100644 lib/kernels/src/cuda/ops/reduction_kernels.cu delete mode 100644 lib/kernels/src/cuda/ops/replicate_kernels.cu delete mode 100644 lib/kernels/src/kernels/array_shape.cc create mode 100644 lib/kernels/src/kernels/attention_kernels.cc create mode 100644 lib/kernels/src/kernels/attention_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/batch_matmul_kernels.cc create mode 100644 lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/batch_norm_kernels.cc 
create mode 100644 lib/kernels/src/kernels/batch_norm_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/cast_kernels.cc rename lib/kernels/src/{cpu/ops/cast_kernels.cc => kernels/cast_kernels_cpu.cc} (82%) create mode 100644 lib/kernels/src/kernels/concat_kernels.cc create mode 100644 lib/kernels/src/kernels/concat_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/conv_2d_kernels.cc create mode 100644 lib/kernels/src/kernels/conv_2d_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/create_local_allocator_for_device_type.cc create mode 100644 lib/kernels/src/kernels/device_handle_t.cc create mode 100644 lib/kernels/src/kernels/device_stream_t.cc create mode 100644 lib/kernels/src/kernels/dropout_kernels.cc create mode 100644 lib/kernels/src/kernels/dropout_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/element_binary_kernels.cc create mode 100644 lib/kernels/src/kernels/element_binary_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/element_unary_kernels.cc create mode 100644 lib/kernels/src/kernels/element_unary_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/embedding_kernels.cc create mode 100644 lib/kernels/src/kernels/embedding_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/flat_kernels.cc create mode 100644 lib/kernels/src/kernels/flat_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/gather_kernels.cc create mode 100644 lib/kernels/src/kernels/gather_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/layer_norm_kernels.cc create mode 100644 lib/kernels/src/kernels/layer_norm_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/linear_kernels.cc create mode 100644 lib/kernels/src/kernels/linear_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/loss_function_kernels.cc create mode 100644 lib/kernels/src/kernels/loss_function_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/optimizer_kernels.cc create mode 100644 lib/kernels/src/kernels/optimizer_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/pool_2d_kernels.cc create mode 100644 lib/kernels/src/kernels/pool_2d_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/reduce_kernels.cc create mode 100644 lib/kernels/src/kernels/reduce_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/reshape_kernels.cc create mode 100644 lib/kernels/src/kernels/reshape_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/reverse_kernels.cc rename lib/kernels/src/{cpu/ops/reverse_kernels.cc => kernels/reverse_kernels_cpu.cc} (64%) create mode 100644 lib/kernels/src/kernels/softmax_kernels.cc create mode 100644 lib/kernels/src/kernels/softmax_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/split_kernels.cc create mode 100644 lib/kernels/src/kernels/split_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/tensor_accessor_binary_ops.cc create mode 100644 lib/kernels/src/kernels/tensor_accessor_unary_ops.cc create mode 100644 lib/kernels/src/kernels/topk_kernels.cc create mode 100644 lib/kernels/src/kernels/topk_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/transpose_kernels.cc create mode 100644 lib/kernels/src/kernels/transpose_kernels_cpu.cc delete mode 100644 lib/kernels/test/src/cpu/ops/replicate_kernels.cc rename lib/kernels/{src => test/src/internal}/test_utils.cc (85%) rename lib/kernels/{include/kernels => test/src/internal}/test_utils.h (96%) delete mode 100644 lib/kernels/test/src/kernels/array_shape.cc create mode 100644 lib/kernels/test/src/kernels/linear_kernels.cc create mode 100644 
lib/kernels/test/src/kernels/linear_kernels_cpu.cc rename lib/kernels/test/src/{cpu/ops/reverse_kernels.cc => kernels/reverse_kernels_cpu.cc} (99%) create mode 100644 lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc delete mode 100644 lib/kernels/test/src/test_combine_kernel.cc delete mode 100644 lib/kernels/test/src/test_managed_ff_stream.cc delete mode 100644 lib/kernels/test/src/test_partition_kernel.cc delete mode 100644 lib/kernels/test/src/test_reduction_kernel.cc delete mode 100644 lib/kernels/test/src/test_replicate_kernel.cc delete mode 100644 lib/local-execution/include/local-execution/allocated_tensors.h delete mode 100644 lib/local-execution/include/local-execution/allocated_tensors.struct.toml delete mode 100644 lib/local-execution/include/local-execution/cost_details.struct.toml delete mode 100644 lib/local-execution/include/local-execution/cost_estimate.h delete mode 100644 lib/local-execution/include/local-execution/cost_metrics.h delete mode 100644 lib/local-execution/include/local-execution/gradient_tensor_source.h create mode 100644 lib/local-execution/include/local-execution/local_args_backing.struct.toml create mode 100644 lib/local-execution/include/local-execution/local_task_registry.h create mode 100644 lib/local-execution/include/local-execution/local_task_registry.struct.toml create mode 100644 lib/local-execution/include/local-execution/local_training_backing.struct.toml create mode 100644 lib/local-execution/include/local-execution/operator_task_set.h create mode 100644 lib/local-execution/include/local-execution/operator_task_set.struct.toml delete mode 100644 lib/local-execution/include/local-execution/optimizer_tensor_source.h create mode 100644 lib/local-execution/include/local-execution/registered_task.h create mode 100644 lib/local-execution/include/local-execution/registered_task_t.variant.toml delete mode 100644 lib/local-execution/include/local-execution/task_registry.h delete mode 100644 lib/local-execution/include/local-execution/task_registry.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml delete mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.h delete mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.struct.toml delete mode 100644 lib/local-execution/src/allocated_tensors.cc create mode 100644 lib/local-execution/src/local-execution/local_args_backing.cc create mode 100644 lib/local-execution/src/local-execution/local_cost_estimator.cc create mode 100644 lib/local-execution/src/local-execution/local_task_registry.cc create mode 100644 lib/local-execution/src/local-execution/local_tensor_backing.cc create mode 100644 lib/local-execution/src/local-execution/local_training_backing.cc create mode 100644 lib/local-execution/src/local-execution/model_training_instance.cc create mode 100644 lib/local-execution/src/local-execution/operator_task_set.cc create mode 100644 lib/local-execution/src/local-execution/registered_task.cc delete mode 100644 lib/local-execution/src/local_args_backing.cc delete mode 100644 lib/local-execution/src/local_cost_estimator.cc delete mode 100644 lib/local-execution/src/local_tensor_backing.cc delete mode 100644 lib/local-execution/src/local_training_backing.cc delete mode 100644 lib/local-execution/src/loss_tensor_source.cc delete mode 100644 lib/local-execution/src/model_training_instance.cc delete mode 100644 lib/local-execution/src/task_registry.cc delete mode 100644 
lib/local-execution/src/unallocated_tensors.cc rename lib/local-execution/test/src/{ => internal}/test_utils.cc (94%) rename lib/local-execution/test/src/{ => internal}/test_utils.h (100%) create mode 100644 lib/local-execution/test/src/local-execution/local_cost_estimator.cc rename lib/local-execution/test/src/{test_local_task_arg_accessor.cc => local-execution/local_task_argument_accessor.cc} (86%) create mode 100644 lib/local-execution/test/src/local-execution/local_task_registry.cc create mode 100644 lib/local-execution/test/src/local-execution/local_tensor_backing.cc rename lib/local-execution/test/src/{test_update.cc => local-execution/local_training_backing.cc} (68%) rename lib/local-execution/test/src/{test_loss_functions.cc => local-execution/loss_functions.cc} (54%) delete mode 100644 lib/local-execution/test/src/test_allocated_tensors.cc delete mode 100644 lib/local-execution/test/src/test_local_cost_estimator.cc delete mode 100644 lib/local-execution/test/src/test_local_tensor_backing.cc delete mode 100644 lib/local-execution/test/src/test_task_registry.cc delete mode 100644 lib/local-execution/test/src/test_unallocated_tensors.cc create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/reversed.h create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h create mode 100644 lib/op-attrs/include/op-attrs/tensor_dims_coord.h rename lib/{kernels/include/kernels/array_coord.struct.toml => op-attrs/include/op-attrs/tensor_dims_coord.struct.toml} (74%) create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc rename lib/{kernels/src/kernels/array_coord.cc => op-attrs/src/op-attrs/tensor_dims_coord.cc} (53%) create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc rename lib/{kernels/test/src/kernels/array_coord.cc => op-attrs/test/src/op-attrs/tensor_dims_coord.cc} (59%) create mode 100644 lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml create mode 100644 lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h create mode 100644 lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml create mode 100644 lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml create mode 100644 lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml rename lib/{task-spec/include/task-spec => pcg/include/pcg}/tensor_role.enum.toml (100%) create mode 100644 lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc delete mode 100644 lib/realm-backend/include/realm-backend/realm_args_backing.h delete mode 100644 lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h delete mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.h delete mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml delete mode 100644 lib/realm-backend/src/realm_args_backing.cc delete mode 100644 lib/realm-backend/src/realm_task_argument_accessor.cc delete mode 100644 lib/realm-backend/src/realm_tensor_backing.cc create mode 100644 lib/realm-backend/src/realm_training_backing copy.cc delete mode 100644 lib/runtime/src/ops/embedding.cc rename lib/task-spec/include/task-spec/{concrete_arg.h => concrete_arg_spec.h} (89%) rename lib/task-spec/include/task-spec/{optimizer_tensor_t.struct.toml => 
forward_tensor_guid_t.struct.toml} (79%) create mode 100644 lib/task-spec/include/task-spec/forward_tensor_source.h rename lib/task-spec/include/task-spec/{gradient_tensor_t.struct.toml => gradient_tensor_guid_t.struct.toml} (78%) create mode 100644 lib/task-spec/include/task-spec/gradient_tensor_source.h rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/loss_functions.h (69%) rename lib/task-spec/include/task-spec/{loss_tensor_t.struct.toml => loss_tensor_guid_t.struct.toml} (87%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/loss_tensor_source.h (50%) create mode 100644 lib/task-spec/include/task-spec/op_task_binding.h create mode 100644 lib/task-spec/include/task-spec/op_task_invocation.struct.toml create mode 100644 lib/task-spec/include/task-spec/op_tensor_spec.struct.toml delete mode 100644 lib/task-spec/include/task-spec/ops/combine.h delete mode 100644 lib/task-spec/include/task-spec/ops/reduction.h delete mode 100644 lib/task-spec/include/task-spec/ops/repartition.h delete mode 100644 lib/task-spec/include/task-spec/ops/replicate.h rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/optimizer.h (51%) rename lib/task-spec/include/task-spec/{lowered_tensor_t.struct.toml => optimizer_tensor_guid_t.struct.toml} (78%) create mode 100644 lib/task-spec/include/task-spec/optimizer_tensor_source.h create mode 100644 lib/task-spec/include/task-spec/runtime_arg_config.struct.toml create mode 100644 lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml rename lib/task-spec/include/task-spec/{slot_tensor_type_id.struct.toml => tensor_sub_slot_id_t.struct.toml} (90%) delete mode 100644 lib/task-spec/include/task-spec/tensor_type_t.variant.toml create mode 100644 lib/task-spec/include/task-spec/training_computation_graph.h create mode 100644 lib/task-spec/include/task-spec/training_computation_graph.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_layer_plus_context.h create mode 100644 lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h create mode 100644 lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_tensor_group.h create mode 100644 lib/task-spec/include/task-spec/training_tensor_group.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h create mode 100644 lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml rename lib/task-spec/src/task-spec/{concrete_arg.cc => concrete_arg_spec.cc} (94%) create mode 100644 lib/task-spec/src/task-spec/forward_tensor_source.cc rename lib/{local-execution/src => task-spec/src/task-spec}/gradient_tensor_source.cc (55%) rename lib/{local-execution/src => task-spec/src/task-spec}/loss_functions.cc (66%) create mode 100644 lib/task-spec/src/task-spec/loss_tensor_source.cc delete mode 100644 lib/task-spec/src/task-spec/ops/combine.cc create mode 100644 lib/task-spec/src/task-spec/ops/embedding.cc delete mode 100644 lib/task-spec/src/task-spec/ops/reduction.cc delete mode 100644 lib/task-spec/src/task-spec/ops/repartition.cc delete mode 100644 lib/task-spec/src/task-spec/ops/replicate.cc rename lib/{local-execution/src => task-spec/src/task-spec}/optimizer.cc (76%) rename 
lib/{local-execution/src => task-spec/src/task-spec}/optimizer_tensor_source.cc (55%) create mode 100644 lib/task-spec/src/task-spec/profiling.cc create mode 100644 lib/task-spec/src/task-spec/runtime_arg_config.cc create mode 100644 lib/task-spec/src/task-spec/training_computation_graph.cc create mode 100644 lib/task-spec/src/task-spec/training_layer_plus_context.cc create mode 100644 lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc create mode 100644 lib/task-spec/src/task-spec/training_tensor_group.cc create mode 100644 lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc create mode 100644 lib/task-spec/test/src/task-spec/training_tensor_group.cc create mode 100644 lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc create mode 100644 lib/utils/include/utils/containers/all_are_true.h create mode 100644 lib/utils/include/utils/containers/collapse_optionals.h create mode 100644 lib/utils/include/utils/containers/contains_value.h create mode 100644 lib/utils/include/utils/fmt/half.h rename lib/utils/include/utils/{fp16.h => half.h} (100%) create mode 100644 lib/utils/include/utils/json/half.h create mode 100644 lib/utils/include/utils/rapidcheck/half.h create mode 100644 lib/utils/include/utils/rapidcheck/monostate.h create mode 100644 lib/utils/include/utils/units/milliseconds_t.h create mode 100644 lib/utils/include/utils/units/num_bytes_t.h rename lib/utils/src/{fp16.cc => half.cc} (87%) create mode 100644 lib/utils/src/utils/containers/all_are_true.cc create mode 100644 lib/utils/src/utils/containers/collapse_optionals.cc create mode 100644 lib/utils/src/utils/containers/contains_value.cc create mode 100644 lib/utils/src/utils/fmt/half.cc create mode 100644 lib/utils/src/utils/json/half.cc create mode 100644 lib/utils/src/utils/rapidcheck/half.cc create mode 100644 lib/utils/src/utils/rapidcheck/monostate.cc create mode 100644 lib/utils/src/utils/units/milliseconds_t.cc create mode 100644 lib/utils/src/utils/units/num_bytes_t.cc create mode 100644 lib/utils/test/common/include/test/utils/doctest/fmt/half.h create mode 100644 lib/utils/test/common/src/test/utils/doctest/fmt/half.cc create mode 100644 lib/utils/test/src/utils/containers/all_are_true.cc create mode 100644 lib/utils/test/src/utils/containers/collapse_optionals.cc create mode 100644 lib/utils/test/src/utils/containers/contains_value.cc diff --git a/.flake/pkgs/ffdb/ffdb.py b/.flake/pkgs/ffdb/ffdb.py index 84354ccd82..b5fc3956bf 100644 --- a/.flake/pkgs/ffdb/ffdb.py +++ b/.flake/pkgs/ffdb/ffdb.py @@ -5,3 +5,4 @@ gdb.execute(f'directory {get_config_root(Path.cwd())}') gdb.prompt_hook = lambda x: '(ffdb) ' gdb.execute('set history save on') +gdb.execute('catch throw') diff --git a/bin/export-model-arch/src/export_model_arch.cc b/bin/export-model-arch/src/export_model_arch.cc index 2dfbc275ec..82aebd2b2c 100644 --- a/bin/export-model-arch/src/export_model_arch.cc +++ b/bin/export-model-arch/src/export_model_arch.cc @@ -23,11 +23,11 @@ using namespace ::FlexFlow; ComputationGraph get_single_operator_computation_graph() { ComputationGraphBuilder b; - nonnegative_int batch_size = 8_n; - nonnegative_int in_channels = 16_n; - nonnegative_int out_channels = 12_n; + positive_int batch_size = 8_p; + positive_int in_channels = 16_p; + positive_int out_channels = 12_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered<nonnegative_int>{ + TensorDims{FFOrdered<positive_int>{ batch_size, in_channels, out_channels, @@ -73,7 +73,7 @@ tl::expected } else if (model_name == "dlrm") { return
get_dlrm_computation_graph(get_default_dlrm_config()); } else if (model_name == "split_test") { - nonnegative_int batch_size = 8_n; + positive_int batch_size = 8_p; return get_split_test_computation_graph(batch_size); } else if (model_name == "single_operator") { return get_single_operator_computation_graph(); diff --git a/flake.lock b/flake.lock index ff6e797d51..f016c47f45 100644 --- a/flake.lock +++ b/flake.lock @@ -66,17 +66,17 @@ ] }, "locked": { - "lastModified": 1746157536, - "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", - "owner": "lockshaw", - "repo": "proj", - "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", - "type": "github" + "lastModified": 1752259929, + "narHash": "sha256-GkMRIi6Xk3qswrbekWtO1sQYz61mw25+62boDk1Gd7s=", + "ref": "refs/heads/master", + "rev": "669773600c781ab8b29ac2379d0c070721417f9d", + "revCount": 117, + "type": "git", + "url": "https://git.sr.ht/~lockshaw/proj" }, "original": { - "owner": "lockshaw", - "repo": "proj", - "type": "github" + "type": "git", + "url": "https://git.sr.ht/~lockshaw/proj" } }, "root": { diff --git a/flake.nix b/flake.nix index 5fa48fa3fd..474a22f385 100644 --- a/flake.nix +++ b/flake.nix @@ -18,7 +18,7 @@ flake-utils.url = "github:numtide/flake-utils"; proj-repo = { - url = "github:lockshaw/proj"; + url = "git+https://git.sr.ht/~lockshaw/proj"; inputs.nixpkgs.follows = "nixpkgs"; inputs.flake-utils.follows = "flake-utils"; }; @@ -121,6 +121,7 @@ lcov # for code coverage compdb gbenchmark + libtorch-bin ]) (with proj-repo.packages.${system}; [ proj @@ -177,6 +178,7 @@ frozendict black toml + numpy ]) (with self.packages.${system}; [ ffdb diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h index ecaffa337b..7b7255a89d 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h +++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h @@ -13,7 +13,7 @@ namespace FlexFlow { struct ICostEstimator { virtual OpCostMetrics estimate_cost(OpCostEstimateKey const &) const = 0; - virtual float estimate_cost(TensorSetMovement const &) const = 0; + virtual milliseconds_t estimate_cost(TensorSetMovement const &) const = 0; ICostEstimator() = default; ICostEstimator(ICostEstimator const &) = delete; @@ -25,7 +25,7 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); struct CostEstimator { OpCostMetrics estimate_cost(OpCostEstimateKey const &) const; - float estimate_cost(TensorSetMovement const &m) const; + milliseconds_t estimate_cost(TensorSetMovement const &m) const; template static typename std::enable_if::value, diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h index 93a1143cde..d905abeb77 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" #include "pcg/device_id_t.dtg.h" #include "pcg/machine_specification.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" @@ -11,9 +12,17 @@ namespace FlexFlow { OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer( ParallelComputationGraph const &pcg, - parallel_layer_guid_t const &layer, + OptimizerAttrs 
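The float-to-milliseconds_t switch in ICostEstimator and CostEstimator above is the recurring theme of this series: raw floats that used to carry runtimes are replaced by a dedicated unit type, so durations can no longer be silently mixed with unrelated scalars. A minimal sketch of the idea follows; the real type lives in utils/units/milliseconds_t.h elsewhere in this patch, and the accessor and operator set shown here are illustrative assumptions, not its actual interface.

#include <cassert>

// Illustrative stand-in for a strongly-typed duration in the spirit of
// milliseconds_t; raw_value() and operator+ are assumptions for this sketch.
struct ms_t {
  explicit ms_t(float v) : value_(v) {}
  float raw_value() const { return value_; }
  ms_t operator+(ms_t other) const { return ms_t{value_ + other.value_}; }
private:
  float value_;
};

int main() {
  ms_t fwd{1.5f};
  ms_t bwd{3.0f};
  ms_t total = fwd + bwd;      // fine: both operands are durations
  // float bad = fwd + 2.0f;   // rejected by the compiler: no silent mixing
  assert(total.raw_value() == 4.5f);
  return 0;
}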
const &optimizer_attrs, + parallel_layer_guid_t const ¶llel_layer_guid, MachineView const &machine_view); +RuntimeOnlyOpCostEstimateKey + runtime_only_from_op_cost_estimate_key(OpCostEstimateKey const &); + +OpCostEstimateKey make_op_cost_estimate_key_from_runtime_only( + RuntimeOnlyOpCostEstimateKey const &runtime_only, + OptimizerAttrs const &optimizer_attrs); + } // namespace FlexFlow #endif diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml index 8fd860d00d..b153bd0072 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml @@ -12,6 +12,7 @@ includes = [ "op-attrs/parallel_tensor_shape.dtg.h", "", "pcg/machine_view.dtg.h", + "pcg/optimizer_attrs.dtg.h", ] src_includes = [ @@ -35,6 +36,10 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>" name = "output_shapes" type = "std::vector<::FlexFlow::ParallelTensorShape>" +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" + [[fields]] name = "machine_view" type = "::FlexFlow::MachineView" diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h new file mode 100644 index 0000000000..f2d12aff71 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_METRICS_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_METRICS_H + +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" + +namespace FlexFlow { + +OpCostMetrics make_op_cost_metrics_from_runtime_only( + RuntimeOnlyOpCostMetrics const &runtime_only, + num_bytes_t const &memory_usage); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml index 5e81d6c10e..7d0c7684a9 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml @@ -7,17 +7,18 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h" + "utils/units/milliseconds_t.h", + "utils/units/num_bytes_t.h", ] [[fields]] name = "forward_runtime" -type = "float" +type = "::FlexFlow::milliseconds_t" [[fields]] name = "backward_runtime" -type = "float" +type = "::FlexFlow::milliseconds_t" [[fields]] -name = "memory" -type = "::FlexFlow::nonnegative_int" +name = "memory_usage" +type = "::FlexFlow::num_bytes_t" diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h new file mode 100644 index 0000000000..aa1c2d70b6 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h @@ -0,0 +1,52 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_H + +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/tensor_set_movement.dtg.h" +#include 
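To make the relationship between the two key types concrete: runtime_only_from_op_cost_estimate_key forgets the optimizer state, and make_op_cost_estimate_key_from_runtime_only re-attaches it. A hedged usage sketch against the declarations above; that the pair round-trips exactly is an inference from the field lists, not something the patch states.

// Sketch: projecting a full key down to its runtime-only part and rebuilding
// it, assuming only the declarations added in op_cost_estimate_key.h.
#include "compiler/cost_estimator/op_cost_estimate_key.h"

namespace FlexFlow {

OpCostEstimateKey round_trip(OpCostEstimateKey const &key) {
  // Drop optimizer_attrs: the result depends only on op attrs, tensor
  // shapes, and the machine view.
  RuntimeOnlyOpCostEstimateKey runtime_only =
      runtime_only_from_op_cost_estimate_key(key);

  // Re-attach the same optimizer state; since optimizer_attrs is the only
  // field the projection discards, this should reproduce key exactly.
  return make_op_cost_estimate_key_from_runtime_only(runtime_only,
                                                     key.optimizer_attrs);
}

} // namespace FlexFlow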
"op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/pcg_operator_attrs.dtg.h" +#include "pcg/machine_view.dtg.h" +#include + +namespace FlexFlow { + +struct IRuntimeOnlyCostEstimator { + virtual RuntimeOnlyOpCostMetrics + estimate_cost(RuntimeOnlyOpCostEstimateKey const &) const = 0; + virtual milliseconds_t estimate_cost(TensorSetMovement const &) const = 0; + + IRuntimeOnlyCostEstimator() = default; + IRuntimeOnlyCostEstimator(IRuntimeOnlyCostEstimator const &) = delete; + IRuntimeOnlyCostEstimator & + operator=(IRuntimeOnlyCostEstimator const &) = delete; + + virtual ~IRuntimeOnlyCostEstimator() = default; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(IRuntimeOnlyCostEstimator); + +struct RuntimeOnlyCostEstimator { + RuntimeOnlyOpCostMetrics + estimate_cost(RuntimeOnlyOpCostEstimateKey const &) const; + milliseconds_t estimate_cost(TensorSetMovement const &m) const; + + template + static typename std::enable_if< + std::is_base_of::value, + RuntimeOnlyCostEstimator>::type + create(Args &&...args) { + return RuntimeOnlyCostEstimator( + std::make_shared(std::forward(args)...)); + } + +private: + RuntimeOnlyCostEstimator( + std::shared_ptr implementation_ptr); + +private: + std::shared_ptr implementation_ptr; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h new file mode 100644 index 0000000000..5757560f9d --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_FROM_COST_ESTIMATOR_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_FROM_COST_ESTIMATOR_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" + +namespace FlexFlow { + +struct RuntimeOnlyCostEstimatorFromCostEstimator final + : public IRuntimeOnlyCostEstimator { + RuntimeOnlyCostEstimatorFromCostEstimator() = delete; + RuntimeOnlyCostEstimatorFromCostEstimator( + CostEstimator const &cost_estimator); + + RuntimeOnlyOpCostMetrics + estimate_cost(RuntimeOnlyOpCostEstimateKey const &) const override; + milliseconds_t estimate_cost(TensorSetMovement const &) const override; + +private: + CostEstimator cost_estimator; +}; + +RuntimeOnlyCostEstimator + runtime_only_cost_estimator_from_cost_estimator(CostEstimator const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h new file mode 100644 index 0000000000..fc3157d74a --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H + +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostEstimateKey + get_mapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + 
parallel_layer_guid_t const ¶llel_layer_guid, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml new file mode 100644 index 0000000000..94be6f6e69 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml @@ -0,0 +1,40 @@ +namespace = "FlexFlow" +name = "RuntimeOnlyOpCostEstimateKey" +features = [ + "eq", + "ord", + "fmt", + "hash", +] + +includes = [ + "op-attrs/pcg_operator_attrs.dtg.h", + "op-attrs/parallel_tensor_shape.dtg.h", + "", + "pcg/machine_view.dtg.h", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::PCGOperatorAttrs" + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "machine_view" +type = "::FlexFlow::MachineView" diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h new file mode 100644 index 0000000000..6b4e34fd75 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_METRICS_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_METRICS_H + +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostMetrics + runtime_only_from_op_cost_metrics(OpCostMetrics const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml new file mode 100644 index 0000000000..65ac318f0e --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "RuntimeOnlyOpCostMetrics" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "utils/units/milliseconds_t.h", +] + +[[fields]] +name = "forward_runtime" +type = "::FlexFlow::milliseconds_t" + +[[fields]] +name = "backward_runtime" +type = "::FlexFlow::milliseconds_t" diff --git a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml index e71cfc540f..8dda2d15ba 100644 --- a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml @@ -8,11 +8,12 @@ features = [ includes = [ "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h", + "utils/units/milliseconds_t.h", ] [[fields]] name = "runtime" -type = "float" +type = "::FlexFlow::milliseconds_t" [[fields]] name = "machine_mapping" diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h 
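The metrics types mirror the key split: RuntimeOnlyOpCostMetrics keeps only the two runtimes, while OpCostMetrics additionally carries memory_usage as a num_bytes_t. A hedged sketch of the two conversion helpers declared above:

// Sketch: moving between the full and runtime-only metrics using the helpers
// from op_cost_metrics.h and runtime_only_op_cost_metrics.h above.
#include "compiler/cost_estimator/op_cost_metrics.h"
#include "compiler/cost_estimator/runtime_only_op_cost_metrics.h"

namespace FlexFlow {

OpCostMetrics attach_memory_usage(RuntimeOnlyOpCostMetrics const &runtime_only,
                                  num_bytes_t const &measured_memory) {
  // forward_runtime and backward_runtime carry over unchanged; only the
  // memory_usage field is filled in.
  return make_op_cost_metrics_from_runtime_only(runtime_only, measured_memory);
}

RuntimeOnlyOpCostMetrics drop_memory_usage(OpCostMetrics const &metrics) {
  // Inverse direction: discard memory_usage, keep the two runtimes.
  return runtime_only_from_op_cost_metrics(metrics);
}

} // namespace FlexFlow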
b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h index 62da90bfcb..2cd3f3e289 100644 --- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h @@ -35,12 +35,12 @@ MachineMappingResult get_optimal_machine_mapping( MachineSpecification const &resources, MachineMappingConstraints const &constraints); -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints); +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml index 81e26f491d..dd49aaa98a 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml @@ -3,16 +3,16 @@ name = "MachineMappingContext" features = [] includes = [ - "compiler/cost_estimator/cost_estimator.h", + "compiler/cost_estimator/runtime_only_cost_estimator.h", "pcg/machine_view.dtg.h", "pcg/machine_specification.dtg.h", - "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h", ] [[fields]] name = "cost_estimator" -type = "::FlexFlow::CostEstimator" +type = "::FlexFlow::RuntimeOnlyCostEstimator" [[fields]] name = "allowed_machine_views" -type = "std::function<std::unordered_set<::FlexFlow::MachineView>(::FlexFlow::UnmappedOpCostEstimateKey const &, ::FlexFlow::MachineSpecification const &)>" +type = "std::function<std::unordered_set<::FlexFlow::MachineView>(::FlexFlow::UnmappedRuntimeOnlyOpCostEstimateKey const &, ::FlexFlow::MachineSpecification const &)>" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h index 29e9e7c90b..65f7006b21 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h @@ -10,15 +10,16 @@ namespace FlexFlow { -GenericBinarySPDecompositionTreeImplementation<MachineMappingProblemTree, - MMProblemTreeSeriesSplit, - MMProblemTreeParallelSplit, - UnmappedOpCostEstimateKey> +GenericBinarySPDecompositionTreeImplementation< + MachineMappingProblemTree, + MMProblemTreeSeriesSplit, + MMProblemTreeParallelSplit, + UnmappedRuntimeOnlyOpCostEstimateKey> generic_binary_sp_impl_for_mm_problem_tree(); SPDecompositionTreeNodeType get_node_type(MachineMappingProblemTree const &); -std::unordered_multiset<UnmappedOpCostEstimateKey> +std::unordered_multiset<UnmappedRuntimeOnlyOpCostEstimateKey> get_leaves(MachineMappingProblemTree const &); std::unordered_set<BinaryTreePath> get_all_leaf_paths(MachineMappingProblemTree const &); diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml index 1949f143cb..808853994a 100644 ---
a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml @@ -9,7 +9,7 @@ features = [ includes = [ "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h", "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h", - "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h", ] [[values]] @@ -21,5 +21,5 @@ type = "::FlexFlow::MMProblemTreeParallelSplit" key = "parallel" [[values]] -type = "::FlexFlow::UnmappedOpCostEstimateKey" +type = "::FlexFlow::UnmappedRuntimeOnlyOpCostEstimateKey" key = "leaf" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h index 9fbad4a1d0..cfffeee245 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h @@ -3,13 +3,24 @@ #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" namespace FlexFlow { UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer( - ParallelComputationGraph const &, parallel_layer_guid_t const &); + ParallelComputationGraph const &pcg, + OptimizerAttrs const &optimizer_attrs, + parallel_layer_guid_t const ¶llel_layer_guid); + +UnmappedOpCostEstimateKey unmapped_op_cost_estimate_key_from_runtime_only( + UnmappedRuntimeOnlyOpCostEstimateKey const &runtime_only, + OptimizerAttrs const &optimizer_attrs); + +UnmappedRuntimeOnlyOpCostEstimateKey + runtime_only_from_unmapped_op_cost_estimate_key( + UnmappedOpCostEstimateKey const &runtime_only); OpCostEstimateKey map_unmapped_op_cost_estimate_key(UnmappedOpCostEstimateKey const &unmapped, diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml index fe76683eb7..5dcfd33859 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml @@ -10,7 +10,7 @@ includes = [ "op-attrs/pcg_operator_attrs.dtg.h", "op-attrs/parallel_tensor_shape.dtg.h", "", - "pcg/machine_view.dtg.h", + "pcg/optimizer_attrs.dtg.h", ] src_includes = [ @@ -34,3 +34,6 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>" name = "output_shapes" type = "std::vector<::FlexFlow::ParallelTensorShape>" +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" diff --git 
a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h new file mode 100644 index 0000000000..c1de7cb956 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_PROBLEM_TREE_UNMAPPED_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_PROBLEM_TREE_UNMAPPED_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H + +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" + +namespace FlexFlow { + +UnmappedRuntimeOnlyOpCostEstimateKey + get_unmapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const ¶llel_layer_guid); + +RuntimeOnlyOpCostEstimateKey map_unmapped_runtime_only_op_cost_estimate_key( + UnmappedRuntimeOnlyOpCostEstimateKey const &unmapped, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml new file mode 100644 index 0000000000..e38ce06f03 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "UnmappedRuntimeOnlyOpCostEstimateKey" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "op-attrs/pcg_operator_attrs.dtg.h", + "op-attrs/parallel_tensor_shape.dtg.h", + "", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::PCGOperatorAttrs" + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h index b21fea5f24..8924b1c110 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h @@ -3,6 +3,7 @@ #include "compiler/machine_mapping/machine_mapping_result.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include "utils/units/milliseconds_t.h" namespace FlexFlow { @@ -14,7 +15,7 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); std::unordered_set const &); [[nodiscard]] MachineMappingResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingResult const &pre_result, MachineMappingResult const &post_result, std::optional const @@ -28,7 +29,7 @@ 
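Putting the two declarations of unmapped_runtime_only_op_cost_estimate_key.h together: an unmapped key is derived from a PCG layer with no optimizer state at all, and binding a MachineView is what turns it into a mapped key. A hedged sketch; pcg, layer, and view stand in for real values.

// Sketch: from a PCG layer to a mapped, runtime-only cost-estimate key,
// using only the declarations added in this header.
#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h"

namespace FlexFlow {

RuntimeOnlyOpCostEstimateKey
    key_for_layer_on_view(ParallelComputationGraph const &pcg,
                          parallel_layer_guid_t const &layer,
                          MachineView const &view) {
  // Attrs and shapes only; no OptimizerAttrs needed on this path.
  UnmappedRuntimeOnlyOpCostEstimateKey unmapped =
      get_unmapped_runtime_only_op_cost_estimate_key_for_layer(pcg, layer);

  // Binding a MachineView is the "mapping" step.
  return map_unmapped_runtime_only_op_cost_estimate_key(unmapped, view);
}

} // namespace FlexFlow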
FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); MachineMappingResult const &m2); [[nodiscard]] MachineMappingResult - make_singleton_machine_mapping_result(float runtime, + make_singleton_machine_mapping_result(milliseconds_t runtime, MachineView const &machine_view); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h index d176d298db..74c6aee851 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h @@ -3,11 +3,12 @@ #include "compiler/machine_mapping/machine_mapping_cache.dtg.h" #include "compiler/machine_mapping/machine_mapping_constraints.dtg.h" -#include "compiler/machine_mapping/machine_mapping_context.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h" #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" #include "pcg/machine_specification.dtg.h" @@ -15,14 +16,14 @@ namespace FlexFlow { MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, MachineMappingConstraints const &constraints); MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints, @@ -31,15 +32,15 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeParallelSplit const ¶llel_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints); MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &, - UnmappedOpCostEstimateKey const &leaf, + MachineMappingWithMemoryContext const &context, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, MachineSpecification const &resources, MachineMappingConstraints const &constraints); diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml new file mode 100644 index 
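series_combine now takes its communication cost as milliseconds_t as well. Assuming the natural reading of a series split, namely that the two halves execute one after the other and pay the tensor movement in between, the runtimes compose as in this sketch; the library's actual implementation is not shown in this patch.

// Sketch of the assumed cost composition behind series_combine; this is an
// inference from the signature, not the implementation from the patch.
#include "utils/units/milliseconds_t.h"

namespace FlexFlow {

milliseconds_t combined_series_runtime(milliseconds_t pre_runtime,
                                       milliseconds_t comm_cost,
                                       milliseconds_t post_runtime) {
  // Pre half, then communication, then post half, end to end.
  return pre_runtime + comm_cost + post_runtime;
}

} // namespace FlexFlow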
0000000000..9530697632 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "MachineMappingWithMemoryContext" +features = [] + +includes = [ + "compiler/cost_estimator/cost_estimator.h", + "pcg/machine_view.dtg.h", + "pcg/machine_specification.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h", + "pcg/optimizer_attrs.dtg.h", +] + +[[fields]] +name = "cost_estimator" +type = "::FlexFlow::CostEstimator" + +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" + +[[fields]] +name = "allowed_machine_views" +type = "std::function<std::unordered_set<::FlexFlow::MachineView>(::FlexFlow::UnmappedRuntimeOnlyOpCostEstimateKey const &, ::FlexFlow::MachineSpecification const &)>" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h index 0383376116..4cb865dece 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h @@ -19,7 +19,7 @@ namespace FlexFlow { MachineMappingWithMemoryResult const &); [[nodiscard]] MachineMappingWithMemoryResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingWithMemoryResult const &pre_result, MachineMappingWithMemoryResult const &post_result, std::optional<ParallelSplitTransformation> const diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml new file mode 100644 index 0000000000..77af129094 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "MachineMappingWithMemoryState" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "pcg/machine_specification.dtg.h", + "compiler/machine_mapping/machine_mapping_constraints.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h", + "pcg/optimizer_attrs.dtg.h", +] + +[[fields]] +name = "problem_tree" +type = "::FlexFlow::MachineMappingProblemTree" + +[[fields]] +name = "resources" +type = "::FlexFlow::MachineSpecification" + +[[fields]] +name = "constraints" +type = "::FlexFlow::MachineMappingConstraints" + +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml index 13f2f17652..cb8490c861 100644 --- a/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml +++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml @@ -7,12 +7,12 @@ features = [ ] includes = [ - "compiler/cost_estimator/op_cost_estimate_key.dtg.h", + "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h", "compiler/cost_estimator/tensor_set_movement.dtg.h", ] [[values]] -type = "::FlexFlow::OpCostEstimateKey" +type = "::FlexFlow::RuntimeOnlyOpCostEstimateKey" key = "operator" [[values]]
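Aside: the headers below switch comm and runtime costs from bare float to the strong unit type milliseconds_t from utils/units. A sketch of the idea only; the real definition lives in utils/units/milliseconds_t.h and is not reproduced by this patch:

    // Values carry their unit in the type, so a bare float cannot be
    // accidentally mixed into a runtime; unwrapping is explicit.
    struct milliseconds_t {
      float value;
      float unwrap_milliseconds() const { return this->value; }
      bool operator==(milliseconds_t other) const { return this->value == other.value; }
      bool operator<(milliseconds_t other) const { return this->value < other.value; }
    };
    milliseconds_t operator+(milliseconds_t lhs, milliseconds_t rhs) {
      return milliseconds_t{lhs.value + rhs.value};
    }
    milliseconds_t operator""_ms(long double v) { return milliseconds_t{static_cast<float>(v)}; }
    milliseconds_t operator""_ms(unsigned long long v) { return milliseconds_t{static_cast<float>(v)}; }

This is what lets literals like 2.0_ms and sums such as pre + comm + post in the later hunks type-check without unit confusion.

diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h 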
b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h index b35733e419..9dadfdb155 100644 --- a/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h +++ b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h @@ -1,15 +1,16 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H -#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" #include "compiler/machine_mapping/machine_mapping.dtg.h" #include "pcg/machine_specification.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" namespace FlexFlow { -float task_simulator_estimate_forward_pass_time( + +milliseconds_t task_simulator_estimate_forward_pass_time( ParallelComputationGraph const &pcg, - CostEstimator const &estimator, + RuntimeOnlyCostEstimator const &estimator, MachineMapping const &machine_mapping, MachineSpecification const &machine_spec); diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc index 6ac6e3a8d6..37e7cc97fd 100644 --- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc +++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc @@ -9,7 +9,7 @@ OpCostMetrics CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->implementation_ptr->estimate_cost(k); } -float CostEstimator::estimate_cost(TensorSetMovement const &m) const { +milliseconds_t CostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->implementation_ptr->estimate_cost(m); } diff --git a/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc index ef5775851f..92b07bbe23 100644 --- a/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc +++ b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc @@ -14,10 +14,39 @@ namespace FlexFlow { OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer( ParallelComputationGraph const &pcg, - parallel_layer_guid_t const &layer, + OptimizerAttrs const &optimizer_attrs, + parallel_layer_guid_t const &parallel_layer_guid, MachineView const &machine_view) { return map_unmapped_op_cost_estimate_key( - get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), machine_view); + get_unmapped_op_cost_estimate_key_for_layer( + pcg, optimizer_attrs, parallel_layer_guid), + machine_view); +} + +RuntimeOnlyOpCostEstimateKey runtime_only_from_op_cost_estimate_key( OpCostEstimateKey const &op_cost_estimate_key) { + + return RuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/op_cost_estimate_key.op_attrs, + /*input_shapes=*/op_cost_estimate_key.input_shapes, + /*weight_shapes=*/op_cost_estimate_key.weight_shapes, + /*output_shapes=*/op_cost_estimate_key.output_shapes, + /*machine_view=*/op_cost_estimate_key.machine_view, + }; +} + +OpCostEstimateKey make_op_cost_estimate_key_from_runtime_only( RuntimeOnlyOpCostEstimateKey const &runtime_only, + OptimizerAttrs const &optimizer_attrs) { + + return OpCostEstimateKey{ + /*op_attrs=*/runtime_only.op_attrs, + /*input_shapes=*/runtime_only.input_shapes, + /*weight_shapes=*/runtime_only.weight_shapes, + /*output_shapes=*/runtime_only.output_shapes, + /*optimizer_attrs=*/optimizer_attrs, + /*machine_view=*/runtime_only.machine_view, + }; } } // namespace FlexFlow
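Aside: the two conversion helpers above are intended to be inverses up to the optimizer attrs. A hypothetical property check, not an actual test in this series (assumes the dtg-generated operator== for OpCostEstimateKey):

    #include <cassert>

    // Dropping the optimizer attrs and re-attaching the same attrs
    // should reproduce the original key exactly.
    void check_key_round_trip(OpCostEstimateKey const &key) {
      RuntimeOnlyOpCostEstimateKey stripped =
          runtime_only_from_op_cost_estimate_key(key);
      OpCostEstimateKey restored = make_op_cost_estimate_key_from_runtime_only(
          stripped, key.optimizer_attrs);
      assert(restored == key);
    }

diff --git 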
a/lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc b/lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc new file mode 100644 index 0000000000..2bca184419 --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc @@ -0,0 +1,16 @@ +#include "compiler/cost_estimator/op_cost_metrics.h" + +namespace FlexFlow { + +OpCostMetrics make_op_cost_metrics_from_runtime_only( + RuntimeOnlyOpCostMetrics const &runtime_only, + num_bytes_t const &memory_usage) { + + return OpCostMetrics{ + /*forward_runtime=*/runtime_only.forward_runtime, + /*backward_runtime=*/runtime_only.backward_runtime, + /*memory_usage=*/memory_usage, + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc new file mode 100644 index 0000000000..4dcb4c33fe --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc @@ -0,0 +1,19 @@ +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimator::RuntimeOnlyCostEstimator( + std::shared_ptr<IRuntimeOnlyCostEstimator> implementation_ptr) + : implementation_ptr(implementation_ptr) {} + +RuntimeOnlyOpCostMetrics RuntimeOnlyCostEstimator::estimate_cost( + RuntimeOnlyOpCostEstimateKey const &k) const { + return this->implementation_ptr->estimate_cost(k); +} + +milliseconds_t + RuntimeOnlyCostEstimator::estimate_cost(TensorSetMovement const &m) const { + return this->implementation_ptr->estimate_cost(m); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc new file mode 100644 index 0000000000..74099e115c --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc @@ -0,0 +1,45 @@ +#include "compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h" +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimatorFromCostEstimator:: + RuntimeOnlyCostEstimatorFromCostEstimator( + CostEstimator const &cost_estimator) + : cost_estimator(cost_estimator) {} + +RuntimeOnlyOpCostMetrics + RuntimeOnlyCostEstimatorFromCostEstimator::estimate_cost( + RuntimeOnlyOpCostEstimateKey const &runtime_only) const { + OptimizerAttrs fake_optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.0, + /*momentum=*/0.0, + /*nesterov=*/false, + /*weight_decay=*/0.0, + }, + }; + + OpCostEstimateKey op_cost_estimate_key = + make_op_cost_estimate_key_from_runtime_only(runtime_only, + fake_optimizer_attrs); + + OpCostMetrics op_cost_metrics = + this->cost_estimator.estimate_cost(op_cost_estimate_key); + + return runtime_only_from_op_cost_metrics(op_cost_metrics); +} + +milliseconds_t RuntimeOnlyCostEstimatorFromCostEstimator::estimate_cost( + TensorSetMovement const &tensor_set_movement) const { + return this->cost_estimator.estimate_cost(tensor_set_movement); +} + +RuntimeOnlyCostEstimator runtime_only_cost_estimator_from_cost_estimator( + CostEstimator const &cost_estimator) { + return RuntimeOnlyCostEstimator::create< RuntimeOnlyCostEstimatorFromCostEstimator>(cost_estimator); +} + +} // namespace FlexFlow
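Aside: the adapter above fills in placeholder SGDOptimizerAttrs because forward and backward runtimes do not depend on optimizer state; only the memory and update costs, which runtime_only_from_op_cost_metrics drops, would. A hypothetical call site, not part of this patch:

    // Wrap a full CostEstimator so code that only needs runtimes can use it.
    RuntimeOnlyOpCostMetrics estimate_runtimes_only(
        CostEstimator const &full_estimator,
        RuntimeOnlyOpCostEstimateKey const &key) {
      RuntimeOnlyCostEstimator runtime_only =
          runtime_only_cost_estimator_from_cost_estimator(full_estimator);
      return runtime_only.estimate_cost(key);
    }

diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc 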
b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc new file mode 100644 index 0000000000..0c097b165e --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc @@ -0,0 +1,17 @@ +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostEstimateKey + get_mapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &parallel_layer_guid, + MachineView const &machine_view) { + return map_unmapped_runtime_only_op_cost_estimate_key( + get_unmapped_runtime_only_op_cost_estimate_key_for_layer( + pcg, parallel_layer_guid), + machine_view); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc new file mode 100644 index 0000000000..4cfd864de5 --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc @@ -0,0 +1,14 @@ +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostMetrics + runtime_only_from_op_cost_metrics(OpCostMetrics const &op_cost_metrics) { + + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/op_cost_metrics.forward_runtime, + /*backward_runtime=*/op_cost_metrics.backward_runtime, + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 49d528e4ab..8ca033d0d6 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -6,6 +6,7 @@ #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" #include "compiler/machine_mapping/machine_mapping_result.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" @@ -86,11 +87,11 @@ MachineMappingResult allowed = generate_map( boundary_layers, [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> { - UnmappedOpCostEstimateKey leaf = + UnmappedRuntimeOnlyOpCostEstimateKey leaf = mm_problem_tree_get_subtree_at_path( MachineMappingProblemTree{series_split}, l) .value() - .get<UnmappedOpCostEstimateKey>(); + .get<UnmappedRuntimeOnlyOpCostEstimateKey>(); return context.allowed_machine_views(leaf, resources); }); return transform( @@ -156,7 +157,7 @@ MachineMappingResult tensor_movement, /*pre_mapping=*/assigned_pre_machine_views, /*post_mapping=*/assigned_post_machine_views); - float cost_across_split = + milliseconds_t cost_across_split = context.cost_estimator.estimate_cost(comm_across_split); result = minimize_runtime(result, @@ -222,12 +223,12 @@ MachineMappingResult get_optimal_machine_mapping( get_mapping_with_minimal_runtime(parallel_results)); } -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &context, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resource, - 
MachineMappingConstraints const &constraints) { +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &context, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, + MachineSpecification const &resource, + MachineMappingConstraints const &constraints) { std::unordered_set<MachineView> candidates = [&] { std::optional<MachineView> machine_view = require_only_root(constraints); @@ -239,10 +240,11 @@ MachineMappingResult }(); auto get_mapping_result = [&](MachineView const &machine_view) { - OpCostEstimateKey mapped = - map_unmapped_op_cost_estimate_key(leaf, machine_view); - OpCostMetrics metrics = context.cost_estimator.estimate_cost(mapped); - float cost = metrics.forward_runtime + metrics.backward_runtime; + RuntimeOnlyOpCostEstimateKey mapped = + map_unmapped_runtime_only_op_cost_estimate_key(leaf, machine_view); + RuntimeOnlyOpCostMetrics metrics = + context.cost_estimator.estimate_cost(mapped); + milliseconds_t cost = metrics.forward_runtime + metrics.backward_runtime; return make_singleton_machine_mapping_result(cost, machine_view); }; diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 367af3701e..da6b7b91e5 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -1,7 +1,7 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" -#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" @@ -41,7 +41,8 @@ MachineMappingProblemTree get_machine_mapping_problem_tree( }, [&](parallel_layer_guid_t const &leaf) { return MachineMappingProblemTree{ - get_unmapped_op_cost_estimate_key_for_layer(pcg, leaf), + get_unmapped_runtime_only_op_cost_estimate_key_for_layer(pcg, + leaf), }; }, }); diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc index 1e39a7be19..09323b1800 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc @@ -5,16 +5,17 @@ namespace FlexFlow { -GenericBinarySPDecompositionTreeImplementation +GenericBinarySPDecompositionTreeImplementation< + MachineMappingProblemTree, + MMProblemTreeSeriesSplit, + MMProblemTreeParallelSplit, + UnmappedRuntimeOnlyOpCostEstimateKey> generic_binary_sp_impl_for_mm_problem_tree() { return GenericBinarySPDecompositionTreeImplementation< MachineMappingProblemTree, MMProblemTreeSeriesSplit, MMProblemTreeParallelSplit, - 
UnmappedOpCostEstimateKey>{ + UnmappedRuntimeOnlyOpCostEstimateKey>{ /*series_get_left_child=*/[](MMProblemTreeSeriesSplit const &split) -> MachineMappingProblemTree const & { return split.get_left_child(); @@ -50,8 +51,8 @@ GenericBinarySPDecompositionTreeImplementation - -> UnmappedOpCostEstimateKey const & { - return tree.get<UnmappedOpCostEstimateKey>(); + -> UnmappedRuntimeOnlyOpCostEstimateKey const & { + return tree.get<UnmappedRuntimeOnlyOpCostEstimateKey>(); }, }; } @@ -65,13 +66,13 @@ SPDecompositionTreeNodeType [](MMProblemTreeParallelSplit const &) { return SPDecompositionTreeNodeType::PARALLEL; }, - [](UnmappedOpCostEstimateKey const &) { + [](UnmappedRuntimeOnlyOpCostEstimateKey const &) { return SPDecompositionTreeNodeType::NODE; }, }); } -std::unordered_multiset<UnmappedOpCostEstimateKey> +std::unordered_multiset<UnmappedRuntimeOnlyOpCostEstimateKey> get_leaves(MachineMappingProblemTree const &tree) { return get_leaves(tree, generic_binary_sp_impl_for_mm_problem_tree()); } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc index 990b287f8b..7659467b6e 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc @@ -1,23 +1,39 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" namespace FlexFlow { UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer( - ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer) { - auto get_tensor_shape = [&](parallel_tensor_guid_t const &t) { - return get_parallel_tensor_shape(pcg, t); - }; + ParallelComputationGraph const &pcg, + OptimizerAttrs const &optimizer_attrs, + parallel_layer_guid_t const &layer) { + return unmapped_op_cost_estimate_key_from_runtime_only( + get_unmapped_runtime_only_op_cost_estimate_key_for_layer(pcg, layer), + optimizer_attrs); +} +UnmappedOpCostEstimateKey unmapped_op_cost_estimate_key_from_runtime_only( UnmappedRuntimeOnlyOpCostEstimateKey const &runtime_only, OptimizerAttrs const &optimizer_attrs) { return UnmappedOpCostEstimateKey{ - /*op_attrs=*/pcg_get_op_attrs(pcg, layer), - /*input_shapes=*/ - transform(get_incoming_inputs(pcg, layer), get_tensor_shape), - /*weight_shapes=*/ - transform(get_incoming_weights(pcg, layer), get_tensor_shape), - /*output_shapes=*/ - transform(get_layer_outputs(pcg, layer), get_tensor_shape), + /*op_attrs=*/runtime_only.op_attrs, + /*input_shapes=*/runtime_only.input_shapes, + /*weight_shapes=*/runtime_only.weight_shapes, + /*output_shapes=*/runtime_only.output_shapes, + /*optimizer_attrs=*/optimizer_attrs, + }; +} + +UnmappedRuntimeOnlyOpCostEstimateKey + runtime_only_from_unmapped_op_cost_estimate_key( UnmappedOpCostEstimateKey const &unmapped_op_cost_estimate_key) { return UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/unmapped_op_cost_estimate_key.op_attrs, + /*input_shapes=*/unmapped_op_cost_estimate_key.input_shapes, + /*weight_shapes=*/unmapped_op_cost_estimate_key.weight_shapes, + /*output_shapes=*/unmapped_op_cost_estimate_key.output_shapes, }; } @@ -29,6 +45,7 @@ OpCostEstimateKey /*input_shapes=*/unmapped.input_shapes, 
/*weight_shapes=*/unmapped.weight_shapes, /*output_shapes=*/unmapped.output_shapes, + /*optimizer_attrs=*/unmapped.optimizer_attrs, /*machine_view=*/machine_view, }; } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc new file mode 100644 index 0000000000..53155a9a9b --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc @@ -0,0 +1,39 @@ +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" + +namespace FlexFlow { + +UnmappedRuntimeOnlyOpCostEstimateKey + get_unmapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &parallel_layer_guid) { + auto get_tensor_shape = [&](parallel_tensor_guid_t const &t) { + return get_parallel_tensor_shape(pcg, t); + }; + + return UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/pcg_get_op_attrs(pcg, parallel_layer_guid), + /*input_shapes=*/ + transform(get_incoming_inputs(pcg, parallel_layer_guid), + get_tensor_shape), + /*weight_shapes=*/ + transform(get_incoming_weights(pcg, parallel_layer_guid), + get_tensor_shape), + /*output_shapes=*/ + transform(get_layer_outputs(pcg, parallel_layer_guid), get_tensor_shape), + }; +} + +RuntimeOnlyOpCostEstimateKey map_unmapped_runtime_only_op_cost_estimate_key( + UnmappedRuntimeOnlyOpCostEstimateKey const &unmapped, + MachineView const &machine_view) { + return RuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/unmapped.op_attrs, + /*input_shapes=*/unmapped.input_shapes, + /*weight_shapes=*/unmapped.weight_shapes, + /*output_shapes=*/unmapped.output_shapes, + /*machine_view=*/machine_view, + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index 3409f7f871..a370a6803d 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -32,7 +32,7 @@ FeasibleMachineMappingResult } MachineMappingResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingResult const &maybe_pre_result, MachineMappingResult const &maybe_post_result, std::optional<ParallelSplitTransformation> const @@ -122,7 +122,7 @@ MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1, } MachineMappingResult - make_singleton_machine_mapping_result(float runtime, + make_singleton_machine_mapping_result(milliseconds_t runtime, MachineView const &machine_view) { return MachineMappingResult{ FeasibleMachineMappingResult{ diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index b67083e8cd..74e8db6304 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -26,7 +26,7 @@ namespace FlexFlow { MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache 
&result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, MachineMappingConstraints const &constraints) { @@ -71,7 +71,7 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints, @@ -85,11 +85,11 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( allowed = generate_map( boundary_layers, [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> { - UnmappedOpCostEstimateKey leaf = + UnmappedRuntimeOnlyOpCostEstimateKey leaf = mm_problem_tree_get_subtree_at_path( MachineMappingProblemTree{series_split}, l) .value() - .get<UnmappedOpCostEstimateKey>(); + .get<UnmappedRuntimeOnlyOpCostEstimateKey>(); return context.allowed_machine_views(leaf, resources); }); return transform( @@ -158,7 +158,7 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( tensor_movement, /*pre_mapping=*/assigned_pre_machine_views, /*post_mapping=*/assigned_post_machine_views); - float cost_across_split = + milliseconds_t cost_across_split = context.cost_estimator.estimate_cost(comm_across_split); result = minimize_runtime(result, @@ -174,7 +174,7 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeParallelSplit const &parallel_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints) { @@ -232,8 +232,8 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, - UnmappedOpCostEstimateKey const &leaf, + MachineMappingWithMemoryContext const &context, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, MachineSpecification const &resource, MachineMappingConstraints const &constraints) { @@ -247,8 +247,10 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( }(); auto get_mapping_result = [&](MachineView const &machine_view) { - OpCostEstimateKey mapped = - map_unmapped_op_cost_estimate_key(leaf, machine_view); + OpCostEstimateKey mapped = map_unmapped_op_cost_estimate_key( + unmapped_op_cost_estimate_key_from_runtime_only( + leaf, context.optimizer_attrs), + machine_view); OpCostMetrics cost = context.cost_estimator.estimate_cost(mapped); return make_singleton_machine_mapping_with_memory_result(cost,
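Aside: the next hunk renames memory to memory_usage inside the Pareto filter. For reference, the dominance test it applies is the usual weak-domination check over the three cost axes; a sketch only (assumes OpCostMetrics's comparison operators from its dtg features, and note the real filter compares the full mapping entries, not just their costs, when requiring the candidates to differ):

    // a dominates b if a is no worse on every axis and the two are not equal.
    bool dominates(OpCostMetrics const &a, OpCostMetrics const &b) {
      return a.forward_runtime <= b.forward_runtime &&
             a.backward_runtime <= b.backward_runtime &&
             a.memory_usage <= b.memory_usage &&
             a != b;
    }
    // A candidate mapping is kept iff no other candidate dominates it.

diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc index 9b4a1fd6fe..cff7984897 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc @@ -33,7 +33,7 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( if (mapping.cost.forward_runtime >= 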
other_mapping.cost.forward_runtime && mapping.cost.backward_runtime >= other_mapping.cost.backward_runtime && - mapping.cost.memory >= other_mapping.cost.memory && + mapping.cost.memory_usage >= other_mapping.cost.memory_usage && mapping != other_mapping) { is_pareto_optimal = false; break; @@ -47,7 +47,7 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( } MachineMappingWithMemoryResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingWithMemoryResult const &pre_result, MachineMappingWithMemoryResult const &post_result, std::optional const @@ -56,11 +56,12 @@ MachineMappingWithMemoryResult [&](MachineMappingForSingleLayer const &pre_mm, MachineMappingForSingleLayer const &post_mm) { OpCostMetrics cost = OpCostMetrics{ - pre_mm.cost.forward_runtime + comm_cost + + /*forward_runtime=*/pre_mm.cost.forward_runtime + comm_cost + post_mm.cost.forward_runtime, - pre_mm.cost.backward_runtime + comm_cost + + /*backward_runtime=*/pre_mm.cost.backward_runtime + comm_cost + post_mm.cost.backward_runtime, - pre_mm.cost.memory + post_mm.cost.memory, + /*memory_usage=*/pre_mm.cost.memory_usage + + post_mm.cost.memory_usage, }; ParallelLayerGuidObliviousMachineMapping mapping = [&] { @@ -98,10 +99,13 @@ MachineMappingWithMemoryResult [&](MachineMappingForSingleLayer const &lhs_mm, MachineMappingForSingleLayer const &rhs_mm) { OpCostMetrics cost = OpCostMetrics{ + /*forward_runtime=*/ std::max(lhs_mm.cost.forward_runtime, rhs_mm.cost.forward_runtime), + /*backward_runtime=*/ std::max(lhs_mm.cost.backward_runtime, - rhs_mm.cost.backward_runtime), //(@wmdi) is this correct? - std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), + rhs_mm.cost.backward_runtime), + /*memory_usage=*/ + std::max(lhs_mm.cost.memory_usage, rhs_mm.cost.memory_usage), }; ParallelLayerGuidObliviousMachineMapping mapping = diff --git a/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc index 539c44a963..c072b0e61e 100644 --- a/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc +++ b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc @@ -1,5 +1,5 @@ #include "compiler/task_graph_simulator/pcg_task_graph.h" -#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.h" #include "compiler/cost_estimator/tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping.dtg.h" #include "pcg/device_id_t.dtg.h" @@ -28,8 +28,8 @@ PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg, for (parallel_layer_guid_t const &layer : get_parallel_layers(pcg)) { MachineView mv = machine_mapping.machine_views.at(layer); - OpCostEstimateKey op_key = - get_mapped_op_cost_estimate_key_for_layer(pcg, layer, mv); + RuntimeOnlyOpCostEstimateKey op_key = + get_mapped_runtime_only_op_cost_estimate_key_for_layer(pcg, layer, mv); Node node = digraph.add_node(); node_to_task.equate(node, PCGTask{op_key}); node_to_layer.equate(node, layer); diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc index 716a7afe15..1e15931174 100644 --- a/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc +++ b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc @@ -8,10 +8,8 @@ namespace FlexFlow { float get_total_execution_time(TaskGraphExecutionTrace 
const &trace) { - if (trace.task_profiles.empty()) { - throw mk_runtime_error( - fmt::format("TaskGraphExecutionTrace {} is empty", trace)); - } + ASSERT(!trace.task_profiles.empty()); + float end_time = maximum(transform(trace.task_profiles, [](TaskProfile const &profile) { return profile.end_time; diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc index ab204e7d71..a1aa53885b 100644 --- a/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc @@ -19,9 +19,9 @@ namespace FlexFlow { -float task_simulator_estimate_forward_pass_time( +milliseconds_t task_simulator_estimate_forward_pass_time( ParallelComputationGraph const &pcg, - CostEstimator const &estimator, + RuntimeOnlyCostEstimator const &estimator, MachineMapping const &machine_mapping, MachineSpecification const &machine_spec) { @@ -30,11 +30,16 @@ float task_simulator_estimate_forward_pass_time( auto cost_function = [&](Node const &node) -> float { PCGTask task = task_graph.node_to_task.at_l(node); - if (task.is_operator()) { - return estimator.estimate_cost(task.require_operator()).forward_runtime; - } else { - return estimator.estimate_cost(task.require_tensor_movement()); - } + + milliseconds_t running_time = [&] { + if (task.is_operator()) { + return estimator.estimate_cost(task.require_operator()).forward_runtime; + } else { + return estimator.estimate_cost(task.require_tensor_movement()); + } + }(); + + return running_time.unwrap_milliseconds(); }; auto is_allowed_to_run = @@ -64,8 +69,8 @@ float task_simulator_estimate_forward_pass_time( TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run}; - return get_total_execution_time(simulate_task_graph_execution( - task_graph.graph, cost_function, constraint)); + return milliseconds_t{get_total_execution_time(simulate_task_graph_execution( + task_graph.graph, cost_function, constraint))}; } } // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index c3342c1b3a..2cbc87cffe 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -1,10 +1,13 @@ #include "compiler/machine_mapping/get_optimal_machine_mapping.h" -#include "../cost_estimator_for_test.h" +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping_cache.h" #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" +#include "internal/runtime_only_cost_estimator_for_test.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" @@ -17,7 +20,7 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { 
TEST_CASE("get_optimal_machine_mapping") { - auto make_leaf = [](UnmappedOpCostEstimateKey const &k) { + auto make_leaf = [](UnmappedRuntimeOnlyOpCostEstimateKey const &k) { return MachineMappingProblemTree{k}; }; @@ -90,14 +93,15 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - auto allowed_machine_views1 = [&](UnmappedOpCostEstimateKey const &, - MachineSpecification const &resources) { - if (resources == full_machine_spec) { - return std::unordered_set{mv1, mv2}; - } else { - return std::unordered_set{mv2}; - } - }; + auto allowed_machine_views1 = + [&](UnmappedRuntimeOnlyOpCostEstimateKey const &, + MachineSpecification const &resources) { + if (resources == full_machine_spec) { + return std::unordered_set{mv1, mv2}; + } else { + return std::unordered_set{mv2}; + } + }; TensorShape tensor_shape = TensorShape{ TensorDims{ @@ -109,24 +113,26 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ - /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, - /*input_shapes=*/{}, - /*weight_shapes=*/{}, - /*output_shapes=*/{}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey k1 = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; - UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ - /*op_attrs=*/PCGOperatorAttrs{ElementBinaryAttrs{ - /*type=*/OperatorType::EW_ADD, - /*compute_type=*/DataType::FLOAT, - /*should_broadcast_lhs=*/false, - /*should_broadcast_rhs=*/false, - }}, - /*input_shapes=*/{}, - /*weight_shapes=*/{}, - /*output_shapes=*/{}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey k2 = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{ElementBinaryAttrs{ + /*type=*/OperatorType::EW_ADD, + /*compute_type=*/DataType::FLOAT, + /*should_broadcast_lhs=*/false, + /*should_broadcast_rhs=*/false, + }}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape); @@ -147,41 +153,39 @@ TEST_SUITE(FF_TEST_SUITE) { {binary_tree_root_path(), mv2}, }}; - auto map1 = std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), - OpCostMetrics{/*forward_runtime=*/0.5, - /*backward_runtime=*/0.5, - /*memory=*/nonnegative_int{0}}}, - {map_unmapped_op_cost_estimate_key(k2, mv1), - OpCostMetrics{/*forward_runtime=*/1.0, - /*backward_runtime=*/1.0, - /*memory=*/nonnegative_int{0}}}, - {map_unmapped_op_cost_estimate_key(k1, mv2), - OpCostMetrics{/*forward_runtime=*/0.75, - /*backward_runtime=*/0.75, - /*memory=*/nonnegative_int{0}}}, - {map_unmapped_op_cost_estimate_key(k2, mv2), - OpCostMetrics{/*forward_runtime=*/1.25, - /*backward_runtime=*/1.25, - /*memory=*/nonnegative_int{0}}}, + auto map1 = std::unordered_map{{ + {map_unmapped_runtime_only_op_cost_estimate_key(k1, mv1), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/0.5_ms, + /*backward_runtime=*/0.5_ms}}, + {map_unmapped_runtime_only_op_cost_estimate_key(k2, mv1), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/1.0_ms, + /*backward_runtime=*/1.0_ms}}, + {map_unmapped_runtime_only_op_cost_estimate_key(k1, mv2), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/0.75_ms, + /*backward_runtime=*/0.75_ms}}, + {map_unmapped_runtime_only_op_cost_estimate_key(k2, mv2), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/1.25_ms, + /*backward_runtime=*/1.25_ms}}, }}; - CostEstimator cost_estimator = make_fake_cost_estimator( - map1, - std::unordered_map{{ - 
{TensorSetMovement{{}}, 0.0}, - {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - 0.1}, - {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - 0.2}, - {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - 0.3}, - {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - 0.4}, - }}); + RuntimeOnlyCostEstimator runtime_only_cost_estimator = + make_fake_runtime_only_cost_estimator( + map1, + std::unordered_map{{ + {TensorSetMovement{{}}, 0.0_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), + 0.1_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), + 0.2_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), + 0.3_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), + 0.4_ms}, + }}); MachineMappingContext context = MachineMappingContext{ - cost_estimator, + runtime_only_cost_estimator, allowed_machine_views1, }; @@ -198,7 +202,7 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/1.0, + /*runtime=*/1.0_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, @@ -221,7 +225,7 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/1.0 + 2.0 + 0.1, + /*runtime=*/1.0_ms + 2.0_ms + 0.1_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -255,7 +259,7 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.5, + /*runtime=*/2.5_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index c5b68e3a76..586a2b7764 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -1,6 +1,6 @@ #include "compiler/machine_mapping/get_tensor_set_movement_across_split.h" -#include "../cost_estimator_for_test.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" +#include "internal/cost_estimator_for_test.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index d2c829df30..2fcffac29a 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -1,5 +1,6 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include 
"compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/get_only.h" @@ -33,9 +34,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; }; - auto mm_problem_tree_make_leaf = [](UnmappedOpCostEstimateKey const &k) { - return MachineMappingProblemTree{k}; - }; + auto mm_problem_tree_make_leaf = + [](UnmappedRuntimeOnlyOpCostEstimateKey const &k) { + return MachineMappingProblemTree{k}; + }; auto mm_problem_tree_make_series = [](AbstractedTensorSetMovement const &tensor_set_movement, @@ -92,7 +94,7 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input_key = [&](ParallelTensorShape const ¶llel_tensor_shape) { - return UnmappedOpCostEstimateKey{ + return UnmappedRuntimeOnlyOpCostEstimateKey{ /*op_attrs=*/input_attrs, /*input_shapes=*/{}, /*weight_shapes=*/{}, @@ -108,7 +110,8 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_labels=*/{}); parallel_layer_guid_t input_layer = input_added.parallel_layer; - UnmappedOpCostEstimateKey input_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input_key = + make_input_key(par_input_shape); PCGBinarySPDecomposition sp_decomposition = PCGBinarySPDecomposition{input_layer}; @@ -129,7 +132,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t input_layer = input_added.parallel_layer; parallel_tensor_guid_t input = get_only(input_added.outputs); - UnmappedOpCostEstimateKey input_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input_key = + make_input_key(par_input_shape); PCGOperatorAttrs relu_attrs = PCGOperatorAttrs{ ElementUnaryAttrs{ @@ -143,12 +147,13 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t relu_layer = relu_added.parallel_layer; parallel_tensor_guid_t relu_output = get_only(relu_added.outputs); - UnmappedOpCostEstimateKey relu_key = UnmappedOpCostEstimateKey{ - /*op_attrs=*/relu_attrs, - /*input_shapes=*/{par_input_shape}, - /*weight_shapes=*/{}, - /*output_shapes=*/{relu_output_shape}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey relu_key = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/relu_attrs, + /*input_shapes=*/{par_input_shape}, + /*weight_shapes=*/{}, + /*output_shapes=*/{relu_output_shape}, + }; PCGBinarySPDecomposition sp_decomposition = pcg_make_series( pcg_make_leaf(input_layer), pcg_make_leaf(relu_layer)); @@ -180,12 +185,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelLayerAddedResult input1_added = pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input1_layer = input1_added.parallel_layer; - UnmappedOpCostEstimateKey input1_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input1_key = + make_input_key(par_input_shape); ParallelLayerAddedResult input2_added = pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input2_layer = input2_added.parallel_layer; - UnmappedOpCostEstimateKey input2_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input2_key = + make_input_key(par_input_shape); PCGBinarySPDecomposition sp_decomposition = pcg_make_parallel( pcg_make_leaf(input1_layer), pcg_make_leaf(input2_layer)); @@ -205,13 +212,15 @@ TEST_SUITE(FF_TEST_SUITE) { pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input1_layer = input1_added.parallel_layer; parallel_tensor_guid_t input1_tensor = get_only(input1_added.outputs); - UnmappedOpCostEstimateKey input1_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey 
input1_key = + make_input_key(par_input_shape); ParallelLayerAddedResult input2_added = pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input2_layer = input2_added.parallel_layer; parallel_tensor_guid_t input2_tensor = get_only(input2_added.outputs); - UnmappedOpCostEstimateKey input2_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input2_key = + make_input_key(par_input_shape); PCGOperatorAttrs ew_op_attrs = PCGOperatorAttrs{ ElementBinaryAttrs{ @@ -228,12 +237,13 @@ TEST_SUITE(FF_TEST_SUITE) { {input1_tensor, input2_tensor}, {}); parallel_layer_guid_t ew_op_layer = ew_op_added.parallel_layer; - UnmappedOpCostEstimateKey ew_op_key = UnmappedOpCostEstimateKey{ - /*op_attrs=*/ew_op_attrs, - /*input_shapes=*/{par_input_shape, par_input_shape}, - /*weight_shapes=*/{}, - /*output_shapes=*/{ew_op_output_shape}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey ew_op_key = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/ew_op_attrs, + /*input_shapes=*/{par_input_shape, par_input_shape}, + /*weight_shapes=*/{}, + /*output_shapes=*/{ew_op_output_shape}, + }; PCGBinarySPDecomposition sp_decomposition = pcg_make_series(pcg_make_parallel(pcg_make_leaf(input1_layer), diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index c7a757d91f..26f61253c3 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -36,7 +36,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - float pre_cost = 2.0; + milliseconds_t pre_cost = 2.0_ms; MachineMappingResult pre = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/pre_cost, @@ -58,7 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - float post_cost = 4.0; + milliseconds_t post_cost = 4.0_ms; MachineMappingResult post = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/post_cost, @@ -74,7 +74,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult infeasible = infeasible_machine_mapping_result(); - float comm_cost = 3.0; + milliseconds_t comm_cost = 3.0_ms; SUBCASE("pre is infeasible") { MachineMappingResult result = series_combine( @@ -219,7 +219,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult lhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.0, + /*runtime=*/2_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -240,7 +240,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult rhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*runtime=*/4_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -278,7 +278,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult result = parallel_combine(lhs, rhs); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*runtime=*/4_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -342,7 +342,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult faster = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.0, + /*runtime=*/2_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -363,7 +363,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult slower = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*runtime=*/4_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { diff --git 
a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 22202c36f7..96b11e6d33 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -1,10 +1,10 @@ #include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h" -#include "../../cost_estimator_for_test.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" +#include "internal/cost_estimator_for_test.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" @@ -18,7 +18,8 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_optimal_machine_mapping_with_memory") { auto make_leaf = [](UnmappedOpCostEstimateKey const &k) { - return MachineMappingProblemTree{k}; + return MachineMappingProblemTree{ + runtime_only_from_unmapped_op_cost_estimate_key(k)}; }; auto make_series_split = @@ -90,14 +91,15 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - auto allowed_machine_views1 = [&](UnmappedOpCostEstimateKey const &, - MachineSpecification const &resources) { - if (resources == full_machine_spec) { - return std::unordered_set{mv1, mv2}; - } else { - return std::unordered_set{mv2}; - } - }; + auto allowed_machine_views1 = + [&](UnmappedRuntimeOnlyOpCostEstimateKey const &, + MachineSpecification const &resources) { + if (resources == full_machine_spec) { + return std::unordered_set{mv1, mv2}; + } else { + return std::unordered_set{mv2}; + } + }; TensorShape tensor_shape = TensorShape{ TensorDims{ @@ -111,11 +113,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape); + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.1, + /*momentum=*/0.1, + /*nesterov=*/false, + /*weight_decay=*/0.1, + }, + }; + UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*optimizer_attrs=*/optimizer_attrs, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -128,6 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*optimizer_attrs=*/optimizer_attrs, }; AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{ @@ -150,36 +163,37 @@ TEST_SUITE(FF_TEST_SUITE) { CostEstimator cost_estimator = make_fake_cost_estimator( std::unordered_map{{ {map_unmapped_op_cost_estimate_key(k1, mv1), - OpCostMetrics{/*forward_runtime=*/1.0, - /*backward_runtime=*/1.0, - /*memory=*/nonnegative_int{2}}}, + OpCostMetrics{/*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + /*memory_usage=*/2_bytes}}, {map_unmapped_op_cost_estimate_key(k2, mv1), - OpCostMetrics{/*forward_runtime=*/2.0, - 
/*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}}}, + OpCostMetrics{/*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/3_bytes}}, {map_unmapped_op_cost_estimate_key(k1, mv2), - OpCostMetrics{/*forward_runtime=*/1.5, - /*backward_runtime=*/1.5, - /*memory=*/nonnegative_int{1}}}, + OpCostMetrics{/*forward_runtime=*/1.5_ms, + /*backward_runtime=*/1.5_ms, + /*memory_usage=*/1_bytes}}, {map_unmapped_op_cost_estimate_key(k2, mv2), - OpCostMetrics{/*forward_runtime=*/2.5, - /*backward_runtime=*/2.5, - /*memory=*/nonnegative_int{2}}}, + OpCostMetrics{/*forward_runtime=*/2.5_ms, + /*backward_runtime=*/2.5_ms, + /*memory_usage=*/2_bytes}}, }}, - std::unordered_map{{ - {TensorSetMovement{/*movements=*/{}}, /*cost=*/0.0}, + std::unordered_map{{ + {TensorSetMovement{/*movements=*/{}}, /*cost=*/0.0_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - /*cost=*/0.1}, + /*cost=*/0.1_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - /*cost=*/0.2}, + /*cost=*/0.2_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - /*cost=*/0.3}, + /*cost=*/0.3_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - /*cost=*/0.4}, + /*cost=*/0.4_ms}, }}); - MachineMappingContext context = MachineMappingContext{ + MachineMappingWithMemoryContext context = MachineMappingWithMemoryContext{ cost_estimator, + optimizer_attrs, allowed_machine_views1, }; @@ -198,17 +212,17 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/1.0, - /*backward_runtime=*/1.0, - /*memory=*/nonnegative_int{2}}, + OpCostMetrics{/*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + /*memory_usage=*/2_bytes}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, }}, }, MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/1.5, - /*backward_runtime=*/1.5, - /*memory=*/nonnegative_int{1}}, + OpCostMetrics{/*forward_runtime=*/1.5_ms, + /*backward_runtime=*/1.5_ms, + /*memory_usage=*/1_bytes}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv2}, }}, @@ -232,9 +246,9 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ OpCostMetrics{ - /*forward_runtime=*/1.0 + 2.0 + 0.1, - /*backward_runtime=*/1.0 + 2.0 + 0.1, - /*memory=*/nonnegative_int{2 + 3}, + /*forward_runtime=*/1.0_ms + 2.0_ms + 0.1_ms, + /*backward_runtime=*/1.0_ms + 2.0_ms + 0.1_ms, + /*memory_usage=*/2_bytes + 3_bytes, }, ParallelLayerGuidObliviousMachineMapping{{ { @@ -252,9 +266,9 @@ TEST_SUITE(FF_TEST_SUITE) { }}, }, MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/1.5 + 2.5 + 0.1, - /*backward_runtime=*/1.5 + 2.5 + 0.1, - /*memory=*/nonnegative_int{1 + 2}}, + OpCostMetrics{/*forward_runtime=*/1.5_ms + 2.5_ms + 0.1_ms, + /*backward_runtime=*/1.5_ms + 2.5_ms + 0.1_ms, + /*memory_usage=*/1_bytes + 2_bytes}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ @@ -288,9 +302,9 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/2.5, - /*backward_runtime=*/2.5, - /*memory=*/nonnegative_int{2}}, + OpCostMetrics{/*forward_runtime=*/2.5_ms, + /*backward_runtime=*/2.5_ms, + 
/*memory_usage=*/2_bytes}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 35b55d2273..2192b442cd 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -53,21 +53,21 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; OpCostMetrics cost2 = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + /*memory_usage=*/1_bytes, }; OpCostMetrics cost3 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/3_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/3_bytes, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ @@ -188,9 +188,9 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics pre_cost = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -215,9 +215,9 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics post_cost = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + /*memory_usage=*/1_bytes, }; MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ @@ -237,7 +237,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult empty = empty_machine_mapping_with_memory_result(); - float comm_cost = 3.0; + milliseconds_t comm_cost = 3_ms; SUBCASE("pre is empty") { MachineMappingWithMemoryResult result = series_combine( @@ -265,7 +265,8 @@ TEST_SUITE(FF_TEST_SUITE) { comm_cost + post_cost.forward_runtime, /*backward_runtime=*/pre_cost.backward_runtime + comm_cost + post_cost.backward_runtime, - /*memory=*/pre_cost.memory + post_cost.memory, + /*memory_usage=*/pre_cost.memory_usage + + post_cost.memory_usage, }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ @@ -321,7 +322,8 @@ TEST_SUITE(FF_TEST_SUITE) { comm_cost + post_cost.forward_runtime, /*backward_runtime=*/pre_cost.backward_runtime + comm_cost + post_cost.backward_runtime, - /*memory=*/pre_cost.memory + post_cost.memory, + /*memory_usage=*/pre_cost.memory_usage + + post_cost.memory_usage, }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ @@ -389,9 +391,9 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics lhs_cost = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -416,9 +418,9 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics rhs_cost = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + 
/*memory_usage=*/1_bytes, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -461,7 +463,8 @@ TEST_SUITE(FF_TEST_SUITE) { /*backward_runtime=*/ std::max(lhs_cost.backward_runtime, rhs_cost.backward_runtime), - /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory), + /*memory_usage=*/ + std::max(lhs_cost.memory_usage, rhs_cost.memory_usage), }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{ @@ -536,19 +539,19 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; OpCostMetrics cost2 = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + /*memory_usage=*/1_bytes, }; OpCostMetrics cost3 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/3_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/3_bytes, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ diff --git a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc index 13f15f6db3..81531d7073 100644 --- a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc @@ -296,7 +296,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_p4 = get_only(p4_added.outputs); RepartitionAttrs p5_attrs = RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{0_n}, + /*repartition_dim=*/ff_dim_t{1_n}, /*repartition_degree=*/2_p, }; ParallelLayerAddedResult p5_added = diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index c3c83dd6b8..6571b78540 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -1,10 +1,10 @@ #include "compiler/task_graph_simulator/task_simulator.h" -#include "../cost_estimator_for_test.h" #include "compiler/cost_estimator/cost_estimator.h" #include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/machine_mapping/machine_mapping.dtg.h" #include "compiler/machine_mapping/machine_mapping.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "internal/runtime_only_cost_estimator_for_test.h" #include "op-attrs/ops/input_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.dtg.h" #include "op-attrs/parallel_tensor_shape.dtg.h" @@ -83,41 +83,45 @@ TEST_SUITE(FF_TEST_SUITE) { }}; SUBCASE("constant op, comm cost") { - CostEstimator estimator = make_fake_constant_cost_estimator( - /*forward_op_cost=*/10.0f, - /*backward_op_cost=*/10.0f, - /*comm_cost=*/1.0f, - /*memory_cost=*/0_n); + RuntimeOnlyCostEstimator estimator = + make_fake_constant_runtime_only_cost_estimator( + /*forward_op_cost=*/10_ms, + /*backward_op_cost=*/10_ms, + /*comm_cost=*/1_ms); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); - float correct = 10 + 1 + 10; + milliseconds_t correct = 10_ms + 1_ms + 
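The tests in this region migrate raw float runtimes and nonnegative_int memory counts to the strong types milliseconds_t and num_bytes_t with their _ms/_bytes literals. A minimal sketch of such a literal-friendly unit type, assuming a single float field and only the operators these tests rely on (the actual definitions elsewhere in the repo may differ):

struct milliseconds_t {
  float unwrapped; // assumed representation

  constexpr bool operator==(milliseconds_t const &o) const {
    return this->unwrapped == o.unwrapped;
  }
  // addition is used throughout the expected values, e.g. 10_ms + 1_ms
  constexpr milliseconds_t operator+(milliseconds_t const &o) const {
    return milliseconds_t{this->unwrapped + o.unwrapped};
  }
};

// user-defined literals so tests can write 1.5_ms and 10_ms
constexpr milliseconds_t operator""_ms(long double v) {
  return milliseconds_t{static_cast<float>(v)};
}
constexpr milliseconds_t operator""_ms(unsigned long long v) {
  return milliseconds_t{static_cast<float>(v)};
}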
10_ms; CHECK(result == correct); } SUBCASE("variable op, comm cost") { - CostEstimator cost_estimator = make_fake_cost_estimator( - [](OpCostEstimateKey const &op) { - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/10.0f, - /*backward_runtime=*/10.0f, - /*memory=*/0_n}; // layer0 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/0_n}; // layer1 - } - return OpCostMetrics{/*forward_runtime=*/0.0f, - /*backward_runtime=*/0.0f, - /*memory=*/0_n}; - }, - [](TensorSetMovement const &comm) { return 5.0f; }); + RuntimeOnlyCostEstimator cost_estimator = + make_fake_runtime_only_cost_estimator( + [](RuntimeOnlyOpCostEstimateKey const &key) { + if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/10_ms, + /*backward_runtime=*/10_ms, + }; // layer0 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + }; // layer1 + } else { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + }; + } + }, + [](TensorSetMovement const &comm) { return 5_ms; }); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); - float correct = 10 + 5 + 1; + milliseconds_t correct = 10_ms + 5_ms + 1_ms; CHECK(result == correct); } } @@ -173,41 +177,47 @@ TEST_SUITE(FF_TEST_SUITE) { {layer2, mv2}, {layer3, mv3}, }}; + SUBCASE("constant op, comm cost") { - CostEstimator estimator = make_fake_constant_cost_estimator( - /*forward_op_cost=*/10.0f, - /*backward_op_cost=*/10.0f, - /*comm_cost=*/1.0f, - /*memory_cost=*/0_n); + RuntimeOnlyCostEstimator estimator = + make_fake_constant_runtime_only_cost_estimator( + /*forward_op_cost=*/10_ms, + /*backward_op_cost=*/10_ms, + /*comm_cost=*/1_ms); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); - float correct = 10 + 1 + 10 + 1 + 10; + milliseconds_t correct = 10_ms + 1_ms + 10_ms + 1_ms + 10_ms; CHECK(result == correct); } + SUBCASE("variable op, comm cost") { - CostEstimator cost_estimator = make_fake_cost_estimator( - [](OpCostEstimateKey const &op) { - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/10.0f, - /*backward_runtime=*/10.0f, - /*memory=*/0_n}; // layer0 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/0_n}; // layers 1, 2 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/2.0f, - /*backward_runtime=*/2.0f, - /*memory=*/0_n}; // layer3 - } - return OpCostMetrics{/*forward_runtime=*/0.0f, - /*backward_runtime=*/0.0f, - /*memory=*/0_n}; - }, - [](TensorSetMovement const &comm) { return 5.0f; }); + RuntimeOnlyCostEstimator cost_estimator = + make_fake_runtime_only_cost_estimator( + [](RuntimeOnlyOpCostEstimateKey const &key) { + if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/10_ms, + /*backward_runtime=*/10_ms, + }; // layer0 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + }; // layers 1, 2 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + }; // layer3 + } else { + return 
RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + }; + } + }, + [](TensorSetMovement const &comm) { return 5_ms; }); } } @@ -220,44 +230,50 @@ TEST_SUITE(FF_TEST_SUITE) { {layer2, mv}, {layer3, mv}, }}; + SUBCASE("constant op, cost cost") { - CostEstimator cost_estimator = make_fake_constant_cost_estimator( - /*forward_op_cost=*/10.0f, - /*backward_op_cost=*/10.0f, - /*comm_cost=*/1.0f, - /*memory_cost=*/0_n); + RuntimeOnlyCostEstimator cost_estimator = + make_fake_constant_runtime_only_cost_estimator( + /*forward_op_cost=*/10_ms, + /*backward_op_cost=*/10_ms, + /*comm_cost=*/1_ms); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); - float correct = 10 + 10 + 10 + 10 + 1 + 1; + milliseconds_t correct = 10_ms + 10_ms + 10_ms + 10_ms + 1_ms + 1_ms; CHECK(result == correct); } + SUBCASE("variable op, cost cost") { - CostEstimator cost_estimator = make_fake_cost_estimator( - [](OpCostEstimateKey const &op) { - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/10.0f, - /*backward_runtime=*/10.0f, - /*memory=*/0_n}; // layer0 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/0_n}; // layers 1, 2 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/2.0f, - /*backward_runtime=*/2.0f, - /*memory=*/0_n}; // layer3 - } - return OpCostMetrics{/*forward_runtime=*/0.0f, - /*backward_runtime=*/0.0f, - /*memory=*/0_n}; - }, - [](TensorSetMovement const &comm) { return 5.0f; }); - float result = task_simulator_estimate_forward_pass_time( + RuntimeOnlyCostEstimator cost_estimator = + make_fake_runtime_only_cost_estimator( + [](RuntimeOnlyOpCostEstimateKey const &key) { + if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/10_ms, + /*backward_runtime=*/10_ms, + }; // layer0 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + }; // layers 1, 2 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + }; // layer3 + } else { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + }; + } + }, + [](TensorSetMovement const &comm) { return 5_ms; }); + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); - float correct = 10 + 5 + (1 + 1) + 5 + 2; + milliseconds_t correct = 10_ms + 5_ms + (1_ms + 1_ms) + 5_ms + 2_ms; CHECK(result == correct); } } diff --git a/lib/compiler/test/src/compiler/cost_estimator_for_test.cc b/lib/compiler/test/src/internal/cost_estimator_for_test.cc similarity index 73% rename from lib/compiler/test/src/compiler/cost_estimator_for_test.cc rename to lib/compiler/test/src/internal/cost_estimator_for_test.cc index 48e6f5e561..60bf6ba7a4 100644 --- a/lib/compiler/test/src/compiler/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/internal/cost_estimator_for_test.cc @@ -9,7 +9,7 @@ namespace FlexFlow { TestCostEstimator::TestCostEstimator( std::function const &get_operator_cost, - std::function const + std::function const &get_communication_cost) : get_operator_cost(get_operator_cost), get_communication_cost(get_communication_cost) {} @@ -19,14 +19,15 @@ OpCostMetrics return this->get_operator_cost(k); } -float 
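The fake estimators exercised above pin per-op and per-communication costs from plain lambdas. A minimal usage sketch of the memory-aware variant declared in cost_estimator_for_test.h, assuming only the declarations shown in this patch:

// Sketch: a constant-cost estimator built from two lambdas.
CostEstimator estimator = make_fake_cost_estimator(
    [](OpCostEstimateKey const &) {
      return OpCostMetrics{/*forward_runtime=*/2_ms,
                           /*backward_runtime=*/2_ms,
                           /*memory_usage=*/64_bytes};
    },
    [](TensorSetMovement const &) { return 0.5_ms; });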
TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { +milliseconds_t + TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->get_communication_cost(m); } CostEstimator make_fake_cost_estimator( std::function const &get_operator_cost, - std::function const + std::function const &get_communication_cost) { return CostEstimator::create(get_operator_cost, get_communication_cost); @@ -34,7 +35,8 @@ CostEstimator make_fake_cost_estimator( CostEstimator make_fake_cost_estimator( std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map) { + std::unordered_map const + &comm_cost_map) { return make_fake_cost_estimator( [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, [comm_cost_map](TensorSetMovement const &m) { @@ -42,10 +44,10 @@ CostEstimator make_fake_cost_estimator( }); } -CostEstimator make_fake_constant_cost_estimator(float forward_op_cost, - float backward_op_cost, - float comm_cost, - nonnegative_int memory_cost) { +CostEstimator make_fake_constant_cost_estimator(milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost, + num_bytes_t memory_cost) { return make_fake_cost_estimator( [=](OpCostEstimateKey const &op) { return OpCostMetrics{forward_op_cost, backward_op_cost, memory_cost}; diff --git a/lib/compiler/test/src/compiler/cost_estimator_for_test.h b/lib/compiler/test/src/internal/cost_estimator_for_test.h similarity index 69% rename from lib/compiler/test/src/compiler/cost_estimator_for_test.h rename to lib/compiler/test/src/internal/cost_estimator_for_test.h index 1e8ce83caf..6a0094839c 100644 --- a/lib/compiler/test/src/compiler/cost_estimator_for_test.h +++ b/lib/compiler/test/src/internal/cost_estimator_for_test.h @@ -13,7 +13,8 @@ namespace FlexFlow { struct TestCostEstimator : public ICostEstimator { std::function get_operator_cost; - std::function get_communication_cost; + std::function + get_communication_cost; TestCostEstimator() = delete; TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost, @@ -22,23 +23,23 @@ struct TestCostEstimator : public ICostEstimator { OpCostMetrics estimate_cost(OpCostEstimateKey const &) const override; - float estimate_cost(TensorSetMovement const &) const override; + milliseconds_t estimate_cost(TensorSetMovement const &) const override; }; CostEstimator make_fake_cost_estimator( std::function const &get_operator_cost, - std::function const + std::function const &get_communication_cost); CostEstimator make_fake_cost_estimator( std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map); + std::unordered_map const &comm_cost_map); -CostEstimator make_fake_constant_cost_estimator(float forward_op_cost, - float backward_op_cost, - float comm_cost, - nonnegative_int memory_cost); +CostEstimator make_fake_constant_cost_estimator(milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost, + num_bytes_t memory_cost); } // namespace FlexFlow diff --git a/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc new file mode 100644 index 0000000000..c52344c6b3 --- /dev/null +++ b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc @@ -0,0 +1,52 @@ +#include "internal/runtime_only_cost_estimator_for_test.h" +#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include 
"compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/op_cost_metrics.h" +#include "compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h" +#include "internal/cost_estimator_for_test.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::function const &get_operator_cost, + std::function const + &get_communication_cost) { + return runtime_only_cost_estimator_from_cost_estimator( + make_fake_cost_estimator( + [get_operator_cost](OpCostEstimateKey const &key) -> OpCostMetrics { + RuntimeOnlyOpCostMetrics runtime_only_op_cost_metrics = + get_operator_cost(runtime_only_from_op_cost_estimate_key(key)); + return make_op_cost_metrics_from_runtime_only( + runtime_only_op_cost_metrics, 0_bytes); + }, + get_communication_cost)); +} + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::unordered_map const &op_cost_map, + std::unordered_map const + &comm_cost_map) { + return make_fake_runtime_only_cost_estimator( + [op_cost_map](RuntimeOnlyOpCostEstimateKey const &k) { + return op_cost_map.at(k); + }, + [comm_cost_map](TensorSetMovement const &m) { + return comm_cost_map.at(m); + }); +} + +RuntimeOnlyCostEstimator make_fake_constant_runtime_only_cost_estimator( + milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost) { + return make_fake_runtime_only_cost_estimator( + [=](RuntimeOnlyOpCostEstimateKey const &op) { + return RuntimeOnlyOpCostMetrics{forward_op_cost, backward_op_cost}; + }, + [=](TensorSetMovement const &op) { return comm_cost; }); +} + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h new file mode 100644 index 0000000000..2b5824263d --- /dev/null +++ b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_TEST_SRC_INTERNAL_RUNTIME_ONLY_COST_ESTIMATOR_FOR_TEST_H +#define _FLEXFLOW_LIB_COMPILER_TEST_SRC_INTERNAL_RUNTIME_ONLY_COST_ESTIMATOR_FOR_TEST_H + +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::function const &get_operator_cost, + std::function const + &get_communication_cost); + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map); + +RuntimeOnlyCostEstimator make_fake_constant_runtime_only_cost_estimator( + milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index eb2a431bd1..ec0d6fde0d 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,29 @@ -#ifndef _FLEXFLOW_KERNELS_ACCESSOR_H -#define _FLEXFLOW_KERNELS_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ACCESSOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ACCESSOR_H -#include "kernels/array_shape.h" #include "kernels/device.h" #include "kernels/ff_handle.h" +#include "kernels/legion_dim.h" +#include "kernels/legion_ordered/legion_ordered.h" #include "op-attrs/datatype.h" +#include "op-attrs/tensor_dims.dtg.h" +#include "op-attrs/tensor_dims.h" +#include "op-attrs/tensor_shape.dtg.h" #include "pcg/device_type.dtg.h" #include 
"utils/containers/transform.h" #include +#include namespace FlexFlow { -nonnegative_int - calculate_accessor_offset(LegionOrdered const &, - ArrayShape const &); +nonnegative_int calculate_accessor_offset(TensorDimsCoord const &, + TensorDims const &); class GenericTensorAccessorR { public: template typename data_type_enum_to_class
::type const *get() const { - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); return static_cast const *>(this->ptr); } @@ -32,8 +36,7 @@ class GenericTensorAccessorR { GenericTensorAccessorR() = delete; - GenericTensorAccessorR(DataType data_type, - ArrayShape const &shape, + GenericTensorAccessorR(TensorShape const &shape, void const *ptr, DeviceType device_type); @@ -41,32 +44,25 @@ class GenericTensorAccessorR { bool operator!=(GenericTensorAccessorR const &) const; template - real_type_t
const &at(FFOrdered const &indices) const { - return this->at
(legion_ordered_from_ff_ordered(indices)); - } - - template - real_type_t
const & - at(LegionOrdered const &indices) const { + real_type_t
const &at(TensorDimsCoord const &indices) const { ASSERT(this->device_type == DeviceType::CPU, "GenericTensorAccessorR::at() requires CPU-allocated tensor"); - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + nonnegative_int offset = + calculate_accessor_offset(indices, this->shape.dims); return data_ptr[offset.unwrap_nonnegative()]; } public: - DataType data_type; - ArrayShape shape; + TensorShape shape; void const *ptr; DeviceType device_type; private: - std::tuple tie() const; @@ -79,7 +75,7 @@ class GenericTensorAccessorW { public: template typename data_type_enum_to_class
::type *get() const { - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); return static_cast *>(this->ptr); } @@ -92,8 +88,7 @@ class GenericTensorAccessorW { GenericTensorAccessorW() = delete; - GenericTensorAccessorW(DataType data_type, - ArrayShape const &shape, + GenericTensorAccessorW(TensorShape const &shape, void *ptr, DeviceType device_type); @@ -103,48 +98,38 @@ class GenericTensorAccessorW { operator GenericTensorAccessorR() const; template - real_type_t
&at(FFOrdered const &indices) { - return this->at
(legion_ordered_from_ff_ordered(indices)); - } - - template - real_type_t
&at(LegionOrdered const &indices) { + real_type_t
&at(TensorDimsCoord const &indices) { ASSERT(this->device_type == DeviceType::CPU, "GenericTensorAccessorW::at() requires CPU-allocated tensor"); - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); using T = real_type_t
; T *data_ptr = static_cast(this->ptr); - nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + nonnegative_int offset = + calculate_accessor_offset(indices, this->shape.dims); return data_ptr[offset.unwrap_nonnegative()]; } template - real_type_t
const &at(FFOrdered const &indices) const { - return this->at
(legion_ordered_from_ff_ordered(indices)); - } - - template - real_type_t
&at(LegionOrdered const &indices) const { + real_type_t
&at(TensorDimsCoord const &indices) const { ASSERT(this->device_type == DeviceType::CPU, "GenericTensorAccessorW::at() requires CPU-allocated tensor"); - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); using T = real_type_t
; - T const *data_ptr = static_cast(this->ptr); - nonnegative_int offset = calculate_accessor_offset(indices, this->shape); - return data_ptr[offset]; + T *data_ptr = static_cast(this->ptr); + nonnegative_int offset = + calculate_accessor_offset(indices, this->shape.dims); + return data_ptr[offset.unwrap_nonnegative()]; } public: - DataType data_type; - ArrayShape shape; + TensorShape shape; void *ptr; DeviceType device_type; private: - std::tuple tie() const; @@ -156,7 +141,7 @@ std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); template typename data_type_enum_to_class
::type * get(GenericTensorAccessorW const &a) { - ASSERT(a.data_type == DT, "Invalid datatype requested"); + ASSERT(a.shape.data_type == DT, "Invalid datatype requested"); return static_cast *>(a.ptr); } @@ -173,7 +158,7 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - ASSERT(a.data_type == DT, "Invalid datatype requested"); + ASSERT(a.shape.data_type == DT, "Invalid datatype requested"); return static_cast const *>(a.ptr); } @@ -221,30 +206,16 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, - GenericTensorAccessorR const &acc2); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); +TensorShape get_tensor_shape_for_accessor_r(GenericTensorAccessorR const &); +TensorShape get_tensor_shape_for_accessor_w(GenericTensorAccessorW const &); -void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW const &dst_accessor, GenericTensorAccessorR const &src_accessor); template real_type_t
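With the TensorDimsCoord-based at() above replacing the old FFOrdered/LegionOrdered overload pair, element access against this header reads roughly as follows. This is a minimal sketch, assuming a CPU Allocator named cpu_allocator and a FLOAT-typed TensorShape named shape already exist (the TensorDimsCoord{FFOrdered{...}} construction is the same one used in create_accessor_with_contents.h later in this patch):

// Sketch: write, then read back, one element of a CPU-allocated float tensor.
GenericTensorAccessorW acc = cpu_allocator.allocate_tensor(shape);

TensorDimsCoord coord = TensorDimsCoord{FFOrdered{0_n, 2_n}};
acc.at<DataType::FLOAT>(coord) = 3.5f; // at() asserts device_type == CPU

GenericTensorAccessorR r = read_only_accessor_from_write_accessor(acc);
float v = r.at<DataType::FLOAT>(coord); // v == 3.5f

 template <DataType DT>
 real_type_t<DT>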
    accessor_get_only_value(GenericTensorAccessorR const &acc) {
-  ASSERT(get_num_elements(acc.shape) == 1);
-  ASSERT(acc.data_type == DT);
+  ASSERT(get_num_elements(acc.shape.dims) == 1);
+  ASSERT(acc.shape.data_type == DT);
   return *static_cast<real_type_t<DT> const *>(acc.ptr);
 }

diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h
index 39bad6599c..0863e2d0ac 100644
--- a/lib/kernels/include/kernels/allocation.h
+++ b/lib/kernels/include/kernels/allocation.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_KERNELS_ALLOCATION_H
-#define _FLEXFLOW_KERNELS_ALLOCATION_H
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATION_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATION_H

 #include "kernels/accessor.h"
 #include
diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h
deleted file mode 100644
index 730bb49e81..0000000000
--- a/lib/kernels/include/kernels/array_coord.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
-#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
-
-#include "kernels/array_coord.dtg.h"
-
-namespace FlexFlow {
-
-ArrayCoord
-    array_coord_drop_dims(ArrayCoord const &coord,
-                          std::function<bool(legion_dim_t)> const &should_drop_dim);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
deleted file mode 100644
index 2b1397dc0e..0000000000
--- a/lib/kernels/include/kernels/array_shape.h
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H
-#define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H
-
-#include "kernels/array_coord.dtg.h"
-#include "kernels/legion_dim.h"
-#include "op-attrs/tensor_shape.dtg.h"
-#include "utils/positive_int/positive_int.h"
-#include "utils/stack_vector/stack_vector.h"
-#include "utils/visitable.h"
-#include
-#include
-#include
-
-namespace FlexFlow {
-
-struct ArrayShape {
-public:
-  ArrayShape() = delete;
-  explicit ArrayShape(LegionOrdered<positive_int> const &dims);
-
-  positive_int num_elements() const;
-
-  nonnegative_int num_dims() const;
-
-  positive_int operator[](legion_dim_t) const;
-  positive_int at(legion_dim_t) const;
-  positive_int at(ff_dim_t) const;
-
-  bool operator==(ArrayShape const &) const;
-  bool operator!=(ArrayShape const &) const;
-
-  legion_dim_t last_idx() const;
-  legion_dim_t neg_idx(int) const;
-
-  std::optional<positive_int> at_maybe(legion_dim_t) const;
-  std::optional<positive_int> at_maybe(ff_dim_t) const;
-
-  ArrayShape sub_shape(ff_dim_t const &start,
-                       std::optional<ff_dim_t> const &end) const;
-
-  ArrayShape sub_shape(legion_dim_t const &start,
-                       std::optional<legion_dim_t> const &end) const;
-
-public:
-  LegionOrdered<positive_int> dims;
-
-private:
-  std::tuple<LegionOrdered<positive_int>> tie() const;
-
-  friend ::std::hash<ArrayShape>;
-};
-
-std::string format_as(ArrayShape const &);
-std::ostream &operator<<(std::ostream &, ArrayShape const &);
-
-positive_int get_num_elements(ArrayShape const &);
-
-ArrayShape array_shape_from_tensor_shape(TensorShape const &);
-TensorShape get_tensor_shape(ArrayShape const &, DataType);
-
-std::unordered_set<ff_dim_t> get_ff_dim_t_set(ArrayShape const &);
-std::unordered_set<ArrayCoord> get_array_coord_set(ArrayShape const &);
-
-ArrayShape
-    array_shape_drop_dims(ArrayShape const &shape,
-                          std::function<bool(legion_dim_t)> const &should_drop_dim);
-
-} // namespace FlexFlow
-
-namespace std {
-
-template <>
-struct hash<::FlexFlow::ArrayShape> {
-  size_t operator()(::FlexFlow::ArrayShape const &) const;
-};
-
-} // namespace std
-
-#endif
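calculate_accessor_offset, which the new at() overloads call above, presumably flattens a TensorDimsCoord into a linear element index over TensorDims. A minimal row-major sketch, assuming that layout (the function name here is hypothetical and the real implementation may choose a different stride order):

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical row-major flattening over plain ints; the real
// calculate_accessor_offset operates on TensorDimsCoord/TensorDims.
int row_major_offset(std::vector<int> const &coord,
                     std::vector<int> const &dims) {
  assert(coord.size() == dims.size());
  int offset = 0;
  for (size_t i = 0; i < dims.size(); i++) {
    assert(0 <= coord[i] && coord[i] < dims[i]);
    offset = offset * dims[i] + coord[i]; // outer dims get larger strides
  }
  return offset;
}

diff --git a/lib/kernels/include/kernels/attention_kernels.h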
b/lib/kernels/include/kernels/attention_kernels.h index b3c77d3430..0c5caabfa0 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -3,93 +3,39 @@ #include "kernels/allocation.h" #include "kernels/device.h" -#include "kernels/ff_handle.h" -#include - -namespace FlexFlow { - -struct MHAPerDeviceState { - PerDeviceFFHandle handle; - size_t weightSize; - size_t reserveSpaceSize; - ffAttnDescriptor_t attnDesc; - ffSeqDataDescriptor_t qDesc; - ffSeqDataDescriptor_t kDesc; - ffSeqDataDescriptor_t vDesc; - ffSeqDataDescriptor_t oDesc; - int *devQoSeqArray; - int *devKvSeqArray; - int *loWinIdx; - int *hiWinIdx; - void *reserveSpace; - Allocator allocator; - - bool operator==(MHAPerDeviceState const &other) const; - bool operator!=(MHAPerDeviceState const &other) const; - -private: - std::tuple - tie() const; -}; - -FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, - handle, - weightSize, - reserveSpaceSize, - attnDesc, - qDesc, - kDesc, - vDesc, - oDesc, - devQoSeqArray, - devKvSeqArray, - loWinIdx, - hiWinIdx, - reserveSpace, - allocator); - -std::string format_as(MHAPerDeviceState const &x); -std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); - -namespace Kernels::MultiHeadAttention { - -MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, - Allocator &, - int num_samples, - int num_heads, - int qSize, - int kSize, - int vSize, - int qProjSize, - int kProjSize, - int vProjSize, - int oProjSize, - int qoSeqLength, - int kvSeqLength, - bool add_bias_kv); - -void forward_kernel(ffStream_t stream, - MHAPerDeviceState const &device_state, +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/mha_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &per_device_ff_handle, + Allocator &allocator, + int num_samples, + int num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv); + +void forward_kernel(device_stream_t const &stream, + std::optional const &device_state, float const *query_ptr, float const *key_ptr, float const *value_ptr, float const *weight_ptr, float *output_ptr); -void backward_kernel(ffStream_t stream, - MHAPerDeviceState const &device_state, +void backward_kernel(device_stream_t const &stream, + std::optional const &device_state, float const *query_ptr, float *query_grad_ptr, float const *key_ptr, @@ -100,10 +46,10 @@ void backward_kernel(ffStream_t stream, float *weight_grad_ptr, float const *output_grad_ptr); -void cleanup_kernel(Allocator &allocator, - MHAPerDeviceState const &device_state); +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional const &device_state); -} // namespace Kernels::MultiHeadAttention -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::MultiHeadAttention #endif diff --git a/lib/kernels/include/kernels/attention_kernels_cpu.h b/lib/kernels/include/kernels/attention_kernels_cpu.h new file mode 100644 index 0000000000..3dfdb45d42 --- /dev/null +++ b/lib/kernels/include/kernels/attention_kernels_cpu.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_CPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" +#include 
"kernels/ff_handle.h" +#include "kernels/mha_per_device_state.dtg.h" +#include + +namespace FlexFlow::Kernels::MultiHeadAttention { + +void cpu_forward_kernel(float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr); + +void cpu_backward_kernel(float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr); + +} // namespace FlexFlow::Kernels::MultiHeadAttention + +#endif diff --git a/lib/kernels/include/kernels/attention_kernels_gpu.h b/lib/kernels/include/kernels/attention_kernels_gpu.h new file mode 100644 index 0000000000..655477a6b0 --- /dev/null +++ b/lib/kernels/include/kernels/attention_kernels_gpu.h @@ -0,0 +1,52 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/ff_handle.h" +#include "kernels/mha_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +MHAPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &, + Allocator &, + int num_samples, + int num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv); + +void gpu_forward_kernel(ffStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr); + +void gpu_cleanup_kernel(Allocator &allocator, + MHAPerDeviceState const &device_state); + +} // namespace FlexFlow::Kernels::MultiHeadAttention + +#endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index 8b67f564d2..db377162b6 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,14 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "kernels/allocation.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" namespace FlexFlow::Kernels::BatchMatmul { -void forward_kernel(ffStream_t stream, - PerDeviceFFHandle const &handle, +void forward_kernel(device_stream_t const &stream, + device_handle_t const &handle, float *output_ptr, float const *a_input_ptr, float const *b_input_ptr, @@ -20,8 +20,8 @@ void forward_kernel(ffStream_t stream, int a_seq_length_dim, int b_seq_length_dim); -void backward_kernel(ffStream_t stream, - PerDeviceFFHandle const &handle, +void backward_kernel(device_stream_t const &stream, + device_handle_t const &handle, float const *o_ptr, float const *o_grad_ptr, float const *a_ptr, diff --git a/lib/kernels/include/kernels/batch_matmul_kernels_cpu.h b/lib/kernels/include/kernels/batch_matmul_kernels_cpu.h new file mode 100644 index 
0000000000..fdef3d7fa1 --- /dev/null +++ b/lib/kernels/include/kernels/batch_matmul_kernels_cpu.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_CPU_H + +#include "kernels/allocation.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void cpu_forward_kernel(float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim); + +void cpu_backward_kernel(float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch); + +} // namespace FlexFlow::Kernels::BatchMatmul + +#endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels_gpu.h b/lib/kernels/include/kernels/batch_matmul_kernels_gpu.h new file mode 100644 index 0000000000..4a35c000c3 --- /dev/null +++ b/lib/kernels/include/kernels/batch_matmul_kernels_gpu.h @@ -0,0 +1,38 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void gpu_forward_kernel(ffStream_t stream, + PerDeviceFFHandle const &handle, + float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim); + +void gpu_backward_kernel(ffStream_t stream, + PerDeviceFFHandle const &handle, + float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch); + +} // namespace FlexFlow::Kernels::BatchMatmul + +#endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 9bb2753a12..47cb3d85a8 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -3,29 +3,31 @@ #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" -#include namespace FlexFlow::Kernels::BatchNorm { -BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, - Allocator allocator, - float *runningMean, - int output_n, - int output_c, - int output_h, - int output_w, - bool relu); +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu); -void forward_kernel(ffStream_t stream, - BatchNormPerDeviceState const &per_device_statem, +void forward_kernel(device_stream_t const &stream, + BatchNormPerDeviceState const &per_device_state, float const *input_ptr, float *output_ptr, float const *scale_ptr, float const *bias_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(device_stream_t const &stream, BatchNormPerDeviceState const &per_device_state, float const *output_ptr, float *output_grad_ptr, @@ -36,13 +38,10 @@ void backward_kernel(ffStream_t stream, float *bias_grad_ptr, size_t numElements); -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t 
inputTensor, - ffTensorDescriptor_t biasTensor, - ffTensorDescriptor_t outputTensor, - ffActivationDescriptor_t actiDesc, - bool relu, - float *runningMean); +void cleanup_kernel( + DeviceType device_type, + Allocator &allocator, + std::optional const &per_device_state); } // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels_cpu.h b/lib/kernels/include/kernels/batch_norm_kernels_cpu.h new file mode 100644 index 0000000000..8c564d6450 --- /dev/null +++ b/lib/kernels/include/kernels/batch_norm_kernels_cpu.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_CPU_H + +#include "kernels/allocation.h" +#include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device_stream_t.dtg.h" + +namespace FlexFlow::Kernels::BatchNorm { + +void cpu_forward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr); + +void cpu_backward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements); + +} // namespace FlexFlow::Kernels::BatchNorm + +#endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels_gpu.h b/lib/kernels/include/kernels/batch_norm_kernels_gpu.h new file mode 100644 index 0000000000..41f9808bff --- /dev/null +++ b/lib/kernels/include/kernels/batch_norm_kernels_gpu.h @@ -0,0 +1,43 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" + +namespace FlexFlow::Kernels::BatchNorm { + +BatchNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu); + +void gpu_forward_kernel(ffStream_t stream, + BatchNormPerDeviceState const &per_device_statem, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr); + +void gpu_backward_kernel(ffStream_t stream, + BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements); + +void gpu_cleanup_kernel(Allocator &allocator, + BatchNormPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::BatchNorm + +#endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 5ec4cb3975..adc64970a1 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -2,17 +2,17 @@ #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Cast { -void forward_kernel(ffStream_t stream, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - 
GenericTensorAccessorW const &input); +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 343ba253d9..2b3d03f097 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -2,15 +2,14 @@ #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H #include "kernels/accessor.h" -#include "kernels/device.h" namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void cpu_backward_kernel(GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input); +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/cast_kernels_gpu.h b/lib/kernels/include/kernels/cast_kernels_gpu.h new file mode 100644 index 0000000000..47336804e9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_gpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CAST_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CAST_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h deleted file mode 100644 index c87465a01f..0000000000 --- a/lib/kernels/include/kernels/combine_kernels.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Combine { - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); - -} // namespace FlexFlow::Kernels::Combine - -#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h deleted file mode 100644 index 75fdd56498..0000000000 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H -#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Combine { - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); - -} // namespace FlexFlow::Kernels::Combine - -#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index 1e3c55bf59..793bf52505 100644 --- 
a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -2,16 +2,16 @@ #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Concat { -void forward_kernel(ffStream_t stream, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis); -void backward_kernel(ffStream_t stream, +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis); diff --git a/lib/kernels/include/kernels/concat_kernels_cpu.h b/lib/kernels/include/kernels/concat_kernels_cpu.h new file mode 100644 index 0000000000..4a7f9fd3c8 --- /dev/null +++ b/lib/kernels/include/kernels/concat_kernels_cpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Concat { + +void cpu_forward_kernel(GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis); + +} // namespace FlexFlow::Kernels::Concat + +#endif diff --git a/lib/kernels/include/kernels/concat_kernels_gpu.h b/lib/kernels/include/kernels/concat_kernels_gpu.h new file mode 100644 index 0000000000..3aaf3fbe2c --- /dev/null +++ b/lib/kernels/include/kernels/concat_kernels_gpu.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Concat { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis); + +} // namespace FlexFlow::Kernels::Concat + +#endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 3b7c0672df..eb7cd7327a 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -2,74 +2,53 @@ #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/conv_2d_per_device_state.dtg.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" -#include "utils/visitable.h" -namespace FlexFlow { - -struct Conv2DPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t biasTensor; - ffTensorDescriptor_t outputTensor; - ffFilterDescriptor_t filterDesc; - ffActivationDescriptor_t actiDesc; - ffConvolutionDescriptor_t convDesc; - ffConvolutionFwdAlgo_t fwdAlgo; - ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; - req bwdDataAlgo; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, - handle, - inputTensor, - biasTensor, - outputTensor, - filterDesc, - actiDesc, - convDesc, - fwdAlgo, - bwdFilterAlgo, - bwdDataAlgo); - -namespace Kernels::Conv2D { - -Conv2DPerDeviceState 
init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int kernel_h, - int kernel_w, - int groups, - int padding_h, - int padding_w, - int stride_h, - int stride_w, - GenericTensorAccessorW const &input, - GenericTensorAccessorW const &output, - float const *filter_ptr, - float *filter_grad_ptr); - -void forward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, +namespace FlexFlow::Kernels::Conv2D { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr); + +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, float const *input_ptr, float *output_ptr, float const *filter_ptr, float const *bias_ptr, std::optional activation); -void backward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *filter_ptr, - float *filter_grad_ptr, - float *bias_grad_ptr, - std::optional activation); - -} // namespace Kernels::Conv2D -} // namespace FlexFlow +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation); + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::Conv2D #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/conv_2d_kernels_cpu.h b/lib/kernels/include/kernels/conv_2d_kernels_cpu.h new file mode 100644 index 0000000000..3a783a395f --- /dev/null +++ b/lib/kernels/include/kernels/conv_2d_kernels_cpu.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_CPU_H + +#include "op-attrs/activation.dtg.h" +#include + +namespace FlexFlow::Kernels::Conv2D { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional const &activation); + +void cpu_backward_kernel(float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional const &activation); + +} // namespace FlexFlow::Kernels::Conv2D + +#endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels_gpu.h b/lib/kernels/include/kernels/conv_2d_kernels_gpu.h new file mode 100644 index 0000000000..9084838e9d --- /dev/null +++ b/lib/kernels/include/kernels/conv_2d_kernels_gpu.h @@ -0,0 +1,44 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_GPU_H + +namespace FlexFlow::Kernels::Conv2D { + +Conv2DPerDeviceState + gpu_init_kernel(PerDeviceFFHandle const &handle, + std::optional const &activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr); + 
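Read together with the unified declarations in conv_2d_kernels.h above, the cpu_/gpu_ split suggests that forward_kernel dispatches on the stream's device type. A minimal sketch, assuming dtgen-style accessors is_gpu()/require_gpu() on device_stream_t (those names are assumptions, not part of this patch):

// Sketch of the dispatch glue implied by this refactor; not the patch's
// literal conv_2d implementation.
void forward_kernel(device_stream_t const &stream,
                    std::optional<Conv2DPerDeviceState> const &per_device_state,
                    float const *input_ptr,
                    float *output_ptr,
                    float const *filter_ptr,
                    float const *bias_ptr,
                    std::optional<Activation> activation) {
  if (stream.is_gpu()) {                        // assumed accessor name
    gpu_forward_kernel(stream.require_gpu(),    // assumed: unwraps ffStream_t
                       per_device_state.value(), // state expected on GPU
                       input_ptr,
                       output_ptr,
                       filter_ptr,
                       bias_ptr,
                       activation);
  } else {
    cpu_forward_kernel(input_ptr, output_ptr, filter_ptr, bias_ptr, activation);
  }
}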
+void gpu_forward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional activation); + +void gpu_backward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation); + +void gpu_cleanup_kernel(Conv2DPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Conv2D + +#endif diff --git a/lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml b/lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml new file mode 100644 index 0000000000..d76dbc89d0 --- /dev/null +++ b/lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml @@ -0,0 +1,48 @@ +namespace = "FlexFlow" +name = "Conv2DPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "biasTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "filterDesc" +type = "ffFilterDescriptor_t" + +[[fields]] +name = "actiDesc" +type = "ffActivationDescriptor_t" + +[[fields]] +name = "convDesc" +type = "ffConvolutionDescriptor_t" + +[[fields]] +name = "fwdAlgo" +type = "ffConvolutionFwdAlgo_t" + +[[fields]] +name = "bwdFilterAlgo" +type = "ffConvolutionBwdFilterAlgo_t" + +[[fields]] +name = "bwdDataAlgo" +type = "ffConvolutionBwdDataAlgo_t" diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h index 81fd59dafb..ef2254071e 100644 --- a/lib/kernels/include/kernels/copy_tensor_accessor.h +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -6,7 +6,7 @@ namespace FlexFlow { -GenericTensorAccessorR +GenericTensorAccessorW copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator); diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h index 9691b0c90a..3574ad0c88 100644 --- a/lib/kernels/include/kernels/create_accessor_with_contents.h +++ b/lib/kernels/include/kernels/create_accessor_with_contents.h @@ -25,8 +25,8 @@ GenericTensorAccessorW for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { - cpu_accessor.at>(FFOrdered{col_idx}) = - contents.at(col_idx.unwrap_nonnegative()); + cpu_accessor.at>(TensorDimsCoord{ + FFOrdered{col_idx}}) = contents.at(col_idx.unwrap_nonnegative()); } GenericTensorAccessorW result = allocator.allocate_tensor(shape); @@ -58,9 +58,10 @@ GenericTensorAccessorW create_2d_accessor_w_with_contents( nonnegative_range(nrows.nonnegative_int_from_positive_int())) { for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { - cpu_accessor.at>(FFOrdered{ - row_idx, col_idx}) = contents.at(row_idx.unwrap_nonnegative()) - .at(col_idx.unwrap_nonnegative()); + cpu_accessor.at>( + TensorDimsCoord{FFOrdered{row_idx, col_idx}}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); } } @@ -105,7 +106,7 @@ GenericTensorAccessorW create_3d_accessor_w_with_contents( for (nonnegative_int dim2_idx : 
nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { cpu_accessor.at>( - FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = + TensorDimsCoord{FFOrdered{dim0_idx, dim1_idx, dim2_idx}}) = contents.at(dim0_idx.unwrap_nonnegative()) .at(dim1_idx.unwrap_nonnegative()) .at(dim2_idx.unwrap_nonnegative()); @@ -165,8 +166,8 @@ GenericTensorAccessorW create_4d_accessor_w_with_contents( nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { for (nonnegative_int dim3_idx : nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { - accessor.at>( - FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = + accessor.at>(TensorDimsCoord{ + FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}}) = contents.at(dim0_idx.unwrap_nonnegative()) .at(dim1_idx.unwrap_nonnegative()) .at(dim2_idx.unwrap_nonnegative()) diff --git a/lib/kernels/include/kernels/create_local_allocator_for_device_type.h b/lib/kernels/include/kernels/create_local_allocator_for_device_type.h new file mode 100644 index 0000000000..16c35f86fd --- /dev/null +++ b/lib/kernels/include/kernels/create_local_allocator_for_device_type.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATOR_FOR_DEVICE_TYPE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATOR_FOR_DEVICE_TYPE_H + +#include "kernels/allocation.h" + +namespace FlexFlow { + +Allocator create_local_allocator_for_device_type(DeviceType); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/device_handle_t.h b/lib/kernels/include/kernels/device_handle_t.h new file mode 100644 index 0000000000..9b7769355e --- /dev/null +++ b/lib/kernels/include/kernels/device_handle_t.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DEVICE_HANDLE_T_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DEVICE_HANDLE_T_H + +#include "kernels/device_handle_t.dtg.h" +#include "kernels/managed_per_device_ff_handle.h" + +namespace FlexFlow { + +device_handle_t device_handle_t_from_managed_handle( + std::optional const &managed_handle); + +device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle); +device_handle_t cpu_make_device_handle_t(); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/device_handle_t.variant.toml b/lib/kernels/include/kernels/device_handle_t.variant.toml new file mode 100644 index 0000000000..ef574e0745 --- /dev/null +++ b/lib/kernels/include/kernels/device_handle_t.variant.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "device_handle_t" +features = [] + +includes = [ + "", + "kernels/ff_handle.h", +] + +[[values]] +type = "::FlexFlow::PerDeviceFFHandle" +key = "for_gpu" + +[[values]] +type = "std::monostate" +key = "for_cpu" diff --git a/lib/kernels/include/kernels/device_stream_t.h b/lib/kernels/include/kernels/device_stream_t.h new file mode 100644 index 0000000000..2a9b2313f6 --- /dev/null +++ b/lib/kernels/include/kernels/device_stream_t.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DEVICE_STREAM_T_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DEVICE_STREAM_T_H + +#include "kernels/device_stream_t.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow { + +device_stream_t get_gpu_device_stream(); +device_stream_t get_cpu_device_stream(); +device_stream_t get_stream_for_device_type(DeviceType); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/device_stream_t.variant.toml b/lib/kernels/include/kernels/device_stream_t.variant.toml new file mode 100644 index 0000000000..b3f8e77171 --- 
/dev/null +++ b/lib/kernels/include/kernels/device_stream_t.variant.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "device_stream_t" +features = [] + +includes = [ + "", + "kernels/device.h", +] + +[[values]] +type = "ffStream_t" +key = "gpu" + +[[values]] +type = "std::monostate" +key = "cpu" diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index 2cc6dd60a3..39f7238114 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -2,60 +2,37 @@ #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #include "kernels/allocation.h" -#include "kernels/array_shape.h" -#include "kernels/device.h" -#include "kernels/ff_handle.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/dropout_per_device_state.dtg.h" #include -namespace FlexFlow { - -struct DropoutPerDeviceState { -public: - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffDropoutDescriptor_t dropoutDesc; - void *reserveSpace; - void *dropoutStates; - size_t reserveSpaceSize; - req dropoutStateSize; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, - handle, - inputTensor, - outputTensor, - dropoutDesc, - reserveSpace, - dropoutStates, - reserveSpaceSize, - dropoutStateSize); - -namespace Kernels::Dropout { - -DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, - float rate, - unsigned long long seed, - ArrayShape const &output_domain, - Allocator allocator); - -void forward_kernel(ffStream_t stream, - DropoutPerDeviceState const &m, - float const *input_ptr, - float *output_ptr); - -void backward_kernel(ffStream_t stream, - DropoutPerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr); - -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t inputTensor, - ffTensorDescriptor_t outputTensor, - ffDropoutDescriptor_t dropoutDesc, - void *dropoutStates); - -} // namespace Kernels::Dropout -} // namespace FlexFlow +namespace FlexFlow::Kernels::Dropout { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr); + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::Dropout #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/dropout_kernels_cpu.h b/lib/kernels/include/kernels/dropout_kernels_cpu.h new file mode 100644 index 0000000000..8d107a8b82 --- /dev/null +++ b/lib/kernels/include/kernels/dropout_kernels_cpu.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Dropout { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr); + +void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Dropout + +#endif diff --git a/lib/kernels/include/kernels/dropout_kernels_gpu.h 
b/lib/kernels/include/kernels/dropout_kernels_gpu.h new file mode 100644 index 0000000000..1e75253499 --- /dev/null +++ b/lib/kernels/include/kernels/dropout_kernels_gpu.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/dropout_per_device_state.dtg.h" +#include "kernels/ff_handle.h" +#include + +namespace FlexFlow::Kernels::Dropout { + +DropoutPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator); + +void gpu_forward_kernel(ffStream_t stream, + DropoutPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + DropoutPerDeviceState const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr); + +void gpu_cleanup_kernel(Allocator &allocator, + DropoutPerDeviceState const &per_device_state); + +} // namespace FlexFlow::Kernels::Dropout + +#endif diff --git a/lib/kernels/include/kernels/dropout_per_device_state.struct.toml b/lib/kernels/include/kernels/dropout_per_device_state.struct.toml new file mode 100644 index 0000000000..ffd8bf37e9 --- /dev/null +++ b/lib/kernels/include/kernels/dropout_per_device_state.struct.toml @@ -0,0 +1,40 @@ +namespace = "FlexFlow" +name = "DropoutPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", +] + +[[fields]] +name = "handle" +type = "PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "dropoutDesc" +type = "ffDropoutDescriptor_t" + +[[fields]] +name = "reserveSpace" +type = "void *" + +[[fields]] +name = "dropoutStates" +type = "void *" + +[[fields]] +name = "reserveSpaceSize" +type = "size_t" + +[[fields]] +name = "dropoutStateSize" +type = "size_t" diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index fd596f2ccf..8c9a405e6f 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,63 +1,55 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "ff_handle.h" -#include "kernels/array_shape.h" #include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/element_binary_per_device_state.dtg.h" +#include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" - -namespace FlexFlow { - -struct ElementBinaryPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputLHSTensor; - ffTensorDescriptor_t inputRHSTensor; - ffTensorDescriptor_t outputTensor; - ffOpTensorDescriptor_t opDesc; - ffReduceTensorDescriptor_t reduceAddDesc; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, - handle, - inputLHSTensor, - inputRHSTensor, - outputTensor, - opDesc, - reduceAddDesc); - -namespace Kernels::ElementBinary { - -ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, - OperatorType op_type, - bool should_broadcast_lhs, - bool should_broadcast_rhs, - ArrayShape lhs_shape, - ArrayShape rhs_shape, - ArrayShape output_shape); - -void 
forward_kernel(ffStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *lhs_ptr, - float const *rhs_ptr, - float *out_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - PerDeviceFFHandle handle); - -void backward_kernel(ffStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *out_grad_ptr, - float const *lhs_ptr, - float const *rhs_ptr, - float *lhs_grad_ptr, - float *rhs_grad_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - bool broadcast_inputRHS, - PerDeviceFFHandle handle); - -} // namespace Kernels::ElementBinary -} // namespace FlexFlow +#include "op-attrs/tensor_shape.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::ElementBinary { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + device_handle_t const &handle); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + device_handle_t const &handle); + +void cleanup_kernel( + DeviceType device_type, + std::optional const &per_device_state); + +} // namespace FlexFlow::Kernels::ElementBinary #endif diff --git a/lib/kernels/include/kernels/element_binary_kernels_cpu.h b/lib/kernels/include/kernels/element_binary_kernels_cpu.h new file mode 100644 index 0000000000..c53920764c --- /dev/null +++ b/lib/kernels/include/kernels/element_binary_kernels_cpu.h @@ -0,0 +1,25 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_CPU_H + +#include "op-attrs/operator_type.dtg.h" + +namespace FlexFlow::Kernels::ElementBinary { + +void cpu_forward_kernel(float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS); + +void cpu_backward_kernel(float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS); + +} // namespace FlexFlow::Kernels::ElementBinary + +#endif diff --git a/lib/kernels/include/kernels/element_binary_kernels_gpu.h b/lib/kernels/include/kernels/element_binary_kernels_gpu.h new file mode 100644 index 0000000000..58a06edb4d --- /dev/null +++ b/lib/kernels/include/kernels/element_binary_kernels_gpu.h @@ -0,0 +1,43 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_GPU_H + +#include "kernels/element_binary_per_device_state.dtg.h" +#include "op-attrs/operator_type.h" +#include "op-attrs/tensor_shape.dtg.h" + +namespace FlexFlow::Kernels::ElementBinary { + +ElementBinaryPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape); + +void 
gpu_forward_kernel(ffStream_t stream, + ElementBinaryPerDeviceState const &per_device_state, + float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + PerDeviceFFHandle handle); + +void gpu_backward_kernel(ffStream_t stream, + ElementBinaryPerDeviceState const &per_device_state, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + PerDeviceFFHandle handle); + +void gpu_cleanup_kernel(ElementBinaryPerDeviceState const &per_device_state); + +} // namespace FlexFlow::Kernels::ElementBinary + +#endif diff --git a/lib/kernels/include/kernels/element_binary_per_device_state.struct.toml b/lib/kernels/include/kernels/element_binary_per_device_state.struct.toml new file mode 100644 index 0000000000..2cae58f847 --- /dev/null +++ b/lib/kernels/include/kernels/element_binary_per_device_state.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "ElementBinaryPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputLHSTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "inputRHSTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "opDesc" +type = "ffOpTensorDescriptor_t" + +[[fields]] +name = "reduceAddDesc" +type = "ffReduceTensorDescriptor_t" diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 0257b3b4a6..a3fb3a1ae0 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -2,46 +2,42 @@ #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/element_unary_per_device_state.dtg.h" #include "kernels/ff_handle.h" -#include "op-attrs/ops/element_unary.h" -#include - -namespace FlexFlow { - -struct ElementUnaryPerDeviceState { - ffTensorDescriptor_t inputTensor, outputTensor; - req actiDesc; -}; - -FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, - inputTensor, - outputTensor, - actiDesc); - -namespace Kernels::ElementUnary { - -ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, - ArrayShape const &output_shape, - ElementUnaryAttrs const &attrs); - -void forward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad); - -} // namespace Kernels::ElementUnary -} // namespace FlexFlow +#include "op-attrs/ops/element_unary_attrs.dtg.h" + +namespace FlexFlow::Kernels::ElementUnary { + +std::optional + init_kernel(DeviceType device_type, + TensorShape const &input_shape, + TensorShape const &output_shape, + ElementUnaryAttrs const &attrs); + +void forward_kernel( + 
device_stream_t const &stream, + std::optional const &device_state, + ElementUnaryAttrs const &attrs, + device_handle_t const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &device_state, + ElementUnaryAttrs const &attrs, + device_handle_t const &handle, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); + +void cleanup_kernel( + DeviceType device_type, + std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::ElementUnary #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels_cpu.h b/lib/kernels/include/kernels/element_unary_kernels_cpu.h new file mode 100644 index 0000000000..55a25411a0 --- /dev/null +++ b/lib/kernels/include/kernels/element_unary_kernels_cpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/ff_handle.h" +#include "op-attrs/ops/element_unary_attrs.dtg.h" + +namespace FlexFlow::Kernels::ElementUnary { + +void cpu_forward_kernel(ElementUnaryAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(ElementUnaryAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::ElementUnary + +#endif diff --git a/lib/kernels/include/kernels/element_unary_kernels_gpu.h b/lib/kernels/include/kernels/element_unary_kernels_gpu.h new file mode 100644 index 0000000000..be5eed0edc --- /dev/null +++ b/lib/kernels/include/kernels/element_unary_kernels_gpu.h @@ -0,0 +1,36 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "kernels/element_unary_per_device_state.dtg.h" +#include "kernels/ff_handle.h" +#include "op-attrs/ops/element_unary_attrs.dtg.h" + +namespace FlexFlow::Kernels::ElementUnary { + +ElementUnaryPerDeviceState gpu_init_kernel(TensorShape const &input_shape, + TensorShape const &output_shape, + ElementUnaryAttrs const &attrs); + +void gpu_forward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &per_device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &per_device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); + +void gpu_cleanup_kernel(ElementUnaryPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::ElementUnary + +#endif diff --git a/lib/kernels/include/kernels/element_unary_per_device_state.struct.toml b/lib/kernels/include/kernels/element_unary_per_device_state.struct.toml new file mode 100644 index 0000000000..019df40315 --- /dev/null +++ b/lib/kernels/include/kernels/element_unary_per_device_state.struct.toml @@ -0,0 +1,19 @@ 
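+# Spec for the generated element_unary_per_device_state.dtg.h header included
+# by the kernel headers above; it records the tensor and activation
+# descriptors that the element-unary GPU kernels reuse between the forward
+# and backward passes.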
+namespace = "FlexFlow" +name = "ElementUnaryPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", +] + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "actiDesc" +type = "ffActivationDescriptor_t" diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index f51a730314..e9c158598a 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -2,11 +2,12 @@ #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" -#include "op-attrs/ops/embedding.h" +#include "kernels/device_stream_t.dtg.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" namespace FlexFlow::Kernels::Embedding { -void forward_kernel(ffStream_t stream, + +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorR const &weight, @@ -16,7 +17,8 @@ void forward_kernel(ffStream_t stream, int in_dim, int out_dim, int batch_size); -void backward_kernel(ffStream_t stream, + +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, @@ -27,12 +29,6 @@ void backward_kernel(ffStream_t stream, int out_dim, int batch_size); -void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p); -void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); - -template -__global__ void rand_generate_int(TD *ptr, size_t size, TD p); - } // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/embedding_kernels_cpu.h b/lib/kernels/include/kernels/embedding_kernels_cpu.h new file mode 100644 index 0000000000..23e32589ae --- /dev/null +++ b/lib/kernels/include/kernels/embedding_kernels_cpu.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" + +namespace FlexFlow::Kernels::Embedding { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + DataType input_data_type, + DataType output_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + DataType output_data_type, + DataType input_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); + +} // namespace FlexFlow::Kernels::Embedding + +#endif diff --git a/lib/kernels/include/kernels/embedding_kernels_gpu.h b/lib/kernels/include/kernels/embedding_kernels_gpu.h new file mode 100644 index 0000000000..7eace3971b --- /dev/null +++ b/lib/kernels/include/kernels/embedding_kernels_gpu.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/embedding.h" + +namespace FlexFlow::Kernels::Embedding { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, 
+ GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + DataType input_data_type, + DataType output_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + DataType output_data_type, + DataType input_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); + +} // namespace FlexFlow::Kernels::Embedding + +#endif diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 31b3296a98..36ed58d91d 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -1,16 +1,16 @@ -#ifndef _FLEXFLOW_KERNELS_FF_HANDLE_H -#define _FLEXFLOW_KERNELS_FF_HANDLE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FF_HANDLE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FF_HANDLE_H #ifdef FF_USE_NCCL #include #endif #include "kernels/device.h" -#include "utils/visitable.h" namespace FlexFlow { struct PerDeviceFFHandle { +public: ffHandle_t dnn; ffblasHandle_t blas; @@ -23,23 +23,6 @@ struct PerDeviceFFHandle { #endif }; -#ifdef FF_USE_NCCL -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, - dnn, - blas, - workSpace, - workSpaceSize, - allowTensorOpMathConversion, - ncclComm); -#else -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, - dnn, - blas, - workSpace, - workSpaceSize, - allowTensorOpMathConversion); -#endif - std::string format_as(PerDeviceFFHandle const &x); std::ostream &operator<<(std::ostream &s, PerDeviceFFHandle const &x); diff --git a/lib/kernels/include/kernels/fill_tensor_accessor.h b/lib/kernels/include/kernels/fill_tensor_accessor.h index b10345933f..0e3cfd0dd5 100644 --- a/lib/kernels/include/kernels/fill_tensor_accessor.h +++ b/lib/kernels/include/kernels/fill_tensor_accessor.h @@ -7,7 +7,7 @@ namespace FlexFlow { -void fill_tensor_accessor(GenericTensorAccessorW &, DataTypeValue val); +void fill_with_zeros(GenericTensorAccessorW const &accessor); GenericTensorAccessorW create_accessor_w_filled_with( TensorShape const &shape, DataTypeValue val, Allocator const &allocator); diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index b2b1164f92..caf04ec125 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -2,16 +2,16 @@ #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Flat { -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR input, +void forward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &input, float *output_ptr); -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR input, +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &input, float const *output_grad_ptr, float *input_grad_ptr); diff --git a/lib/kernels/include/kernels/flat_kernels_cpu.h b/lib/kernels/include/kernels/flat_kernels_cpu.h new file mode 100644 index 0000000000..2fe43b0927 --- /dev/null +++ b/lib/kernels/include/kernels/flat_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Flat { + +void 
cpu_forward_kernel(GenericTensorAccessorR const &input, float *output_ptr); + +void cpu_backward_kernel(GenericTensorAccessorR const &input, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Flat + +#endif diff --git a/lib/kernels/include/kernels/flat_kernels_gpu.h b/lib/kernels/include/kernels/flat_kernels_gpu.h new file mode 100644 index 0000000000..4e889c561c --- /dev/null +++ b/lib/kernels/include/kernels/flat_kernels_gpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Flat { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Flat + +#endif diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 8cbc7e457e..66c79ab7ac 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -1,35 +1,30 @@ #ifndef _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H -#include "accessor.h" -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/gather_per_device_state.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::Gather { -struct GatherPerDeviceState { - PerDeviceFFHandle handle; - legion_dim_t legion_dim; -}; +std::optional init_kernel(DeviceType device_type, + device_handle_t const &handle, + ff_dim_t dim); -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, - handle, - legion_dim); - -namespace Kernels::Gather { - -void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &per_device_state, +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); -void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &per_device_state, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad); +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad); -} // namespace Kernels::Gather -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather #endif diff --git a/lib/kernels/include/kernels/gather_kernels_cpu.h b/lib/kernels/include/kernels/gather_kernels_cpu.h new file mode 100644 index 0000000000..74e8f35d9f --- /dev/null +++ b/lib/kernels/include/kernels/gather_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Gather { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + 
GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Gather + +#endif diff --git a/lib/kernels/include/kernels/gather_kernels_gpu.h b/lib/kernels/include/kernels/gather_kernels_gpu.h new file mode 100644 index 0000000000..da0866dbca --- /dev/null +++ b/lib/kernels/include/kernels/gather_kernels_gpu.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "kernels/gather_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::Gather { + +GatherPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim); + +void gpu_forward_kernel(ffStream_t stream, + GatherPerDeviceState const &per_device_state, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + GatherPerDeviceState const &per_device_state, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Gather + +#endif diff --git a/lib/kernels/include/kernels/gather_per_device_state.struct.toml b/lib/kernels/include/kernels/gather_per_device_state.struct.toml new file mode 100644 index 0000000000..c5163f0ddc --- /dev/null +++ b/lib/kernels/include/kernels/gather_per_device_state.struct.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "GatherPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "op-attrs/ff_dim_t.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "dim" +type = "::FlexFlow::ff_dim_t" diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index 10cf2fb14b..7d59e323ba 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -2,61 +2,44 @@ #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #include "kernels/allocation.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" - -namespace FlexFlow { - -struct LayerNormPerDeviceState { - PerDeviceFFHandle handle; - bool elementwise_affine; - int64_t effective_batch_size, effective_num_elements; - float eps; - float *mean, *rstd, *ds, *db, *scale, *bias; - DataType data_type; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, - handle, - elementwise_affine, - effective_batch_size, - effective_num_elements, - eps, - mean, - rstd, - ds, - db, - scale, - bias, - data_type); - -namespace Kernels::LayerNorm { - -// todo: this may have some problem. 
-LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - Allocator &allocator, - bool elementwise_affine, - int64_t effective_batch_size, - int64_t effective_num_elements, - float eps); - -void forward_kernel(ffStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorW const &gamma, - GenericTensorAccessorW const &beta); - -void backward_kernel(ffStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorW const &gamma_grad, - GenericTensorAccessorW const &beta_grad); - -} // namespace Kernels::LayerNorm -} // namespace FlexFlow +#include "kernels/layer_norm_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::LayerNorm { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + Allocator &allocator, + bool elementwise_affine, + int64_t effective_batch_size, + int64_t effective_num_elements, + float eps); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &gamma, + GenericTensorAccessorW const &beta); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + +void cleanup_kernel( + DeviceType device_type, + std::optional const &per_device_state); + +} // namespace FlexFlow::Kernels::LayerNorm #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/layer_norm_kernels_cpu.h b/lib/kernels/include/kernels/layer_norm_kernels_cpu.h new file mode 100644 index 0000000000..74239a36eb --- /dev/null +++ b/lib/kernels/include/kernels/layer_norm_kernels_cpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::LayerNorm { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &gamma, + GenericTensorAccessorW const &beta); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + +} // namespace FlexFlow::Kernels::LayerNorm + +#endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels_gpu.h b/lib/kernels/include/kernels/layer_norm_kernels_gpu.h new file mode 100644 index 0000000000..ccf6d3027c --- /dev/null +++ b/lib/kernels/include/kernels/layer_norm_kernels_gpu.h @@ -0,0 +1,39 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/layer_norm_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::LayerNorm { + 
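+// Rough usage sketch (illustrative only; names like `state`, `b`, and `n`
+// are not from this header):
+//
+//   LayerNormPerDeviceState state = gpu_init_kernel(
+//       handle, allocator, /*elementwise_affine=*/true,
+//       /*effective_batch_size=*/b, /*effective_num_elements=*/n,
+//       /*eps=*/1e-5f);
+//   gpu_forward_kernel(stream, state, input, output, gamma, beta);
+//   gpu_backward_kernel(stream, state, output_grad, input, input_grad,
+//                       gamma, gamma_grad, beta_grad);
+//   gpu_cleanup_kernel(state);
+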
+// TODO: this may have some problems.
+LayerNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle,
+                                        Allocator &allocator,
+                                        bool elementwise_affine,
+                                        int64_t effective_batch_size,
+                                        int64_t effective_num_elements,
+                                        float eps);
+
+void gpu_forward_kernel(ffStream_t stream,
+                        LayerNormPerDeviceState const &per_device_state,
+                        GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        GenericTensorAccessorW const &gamma,
+                        GenericTensorAccessorW const &beta);
+
+void gpu_backward_kernel(ffStream_t stream,
+                         LayerNormPerDeviceState const &per_device_state,
+                         GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &input_grad,
+                         GenericTensorAccessorR const &gamma,
+                         GenericTensorAccessorW const &gamma_grad,
+                         GenericTensorAccessorW const &beta_grad);
+
+void gpu_cleanup_kernel(LayerNormPerDeviceState const &per_device_state);
+
+} // namespace FlexFlow::Kernels::LayerNorm
+
+#endif
diff --git a/lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml b/lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml
new file mode 100644
index 0000000000..0a482d5395
--- /dev/null
+++ b/lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml
@@ -0,0 +1,57 @@
+namespace = "FlexFlow"
+name = "LayerNormPerDeviceState"
+features = []
+
+includes = [
+  "kernels/ff_handle.h",
+  "op-attrs/datatype.dtg.h",
+]
+
+[[fields]]
+name = "handle"
+type = "::FlexFlow::PerDeviceFFHandle"
+
+[[fields]]
+name = "elementwise_affine"
+type = "bool"
+
+[[fields]]
+name = "effective_num_elements"
+type = "int64_t"
+
+[[fields]]
+name = "effective_batch_size"
+type = "int64_t"
+
+[[fields]]
+name = "eps"
+type = "float"
+
+[[fields]]
+name = "mean"
+type = "float *"
+
+[[fields]]
+name = "rstd"
+type = "float *"
+
+[[fields]]
+name = "ds"
+type = "float *"
+
+[[fields]]
+name = "db"
+type = "float *"
+
+[[fields]]
+name = "scale"
+type = "float *"
+
+[[fields]]
+name = "bias"
+type = "float *"
+
+[[fields]]
+name = "data_type"
+type = "::FlexFlow::DataType"
+
diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
index 796423102b..24eff46e22 100644
--- a/lib/kernels/include/kernels/legion_dim.h
+++ b/lib/kernels/include/kernels/legion_dim.h
@@ -5,6 +5,7 @@
 #include "kernels/legion_ordered/legion_ordered.h"
 #include "op-attrs/ff_dim_t.dtg.h"
 #include "op-attrs/ff_ordered/ff_ordered.h"
+#include "op-attrs/tensor_dims.dtg.h"
 #include "utils/containers/set_of.h"
 #include "utils/containers/transform.h"
 #include "utils/nonnegative_int/nonnegative_range.h"
@@ -13,6 +14,9 @@
 namespace FlexFlow {
 
+positive_int dim_at_idx(TensorDims const &, legion_dim_t);
+positive_int &dim_at_idx(TensorDims &, legion_dim_t);
+
 legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);
 
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions);
diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h
index ad8b3bad6d..87836fb31e 100644
--- a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h
+++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h
@@ -11,17 +11,17 @@ template 
 struct LegionOrdered {
   LegionOrdered() {}
 
-  LegionOrdered(std::initializer_list const &l)
+  explicit LegionOrdered(std::initializer_list const &l)
       : contents(l.begin(), l.end()) {}
 
-  LegionOrdered(std::vector const &contents)
+  explicit LegionOrdered(std::vector const &contents)
      :
contents(contents.begin(), contents.end()) {} template - LegionOrdered(It begin, It end) : contents(begin, end) {} + explicit LegionOrdered(It begin, It end) : contents(begin, end) {} template - LegionOrdered(stack_vector const &contents) + explicit LegionOrdered(stack_vector const &contents) : contents(contents.begin(), contents.end()) {} T const &at(legion_dim_t idx) const { diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 21d84c2567..0b6371c766 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,77 +1,54 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "ff_handle.h" -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/ff_handle.h" +#include "kernels/linear_per_device_state.dtg.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" +#include "pcg/device_type.dtg.h" namespace FlexFlow { -struct LinearPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t outputTensor; - ffActivationDescriptor_t actiDesc; - float const *one_ptr; // how to handle this? - cudnnActivationMode_t activation_mode; - std::optional activation; - std::optional regularizer; - bool use_bias; - DataType input_type, weight_type, output_type; -}; +std::optional + linear_init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int output_num_channels); + +void linear_forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW const &output_accessor, + GenericTensorAccessorR const &projection_accessor, + std::optional const &bias_accessor); + +void linear_backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &projection, + GenericTensorAccessorW const &projection_grad, + std::optional const &bias_grad); + +void linear_cleanup_kernel( + DeviceType device_type, + std::optional &per_device_state); -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, - handle, - outputTensor, - actiDesc, - one_ptr, - activation_mode, - activation, - regularizer, - use_bias, - input_type, - weight_type, - output_type); - -namespace Kernels::Linear { - -LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, - float *one_ptr, - std::optional activation, - std::optional regularizer, - bool use_bias, - DataType input_type, - DataType weight_type, - DataType output_type, - int batch_size, - int channel); - -bool use_activation(Activation activation); - -void forward_kernel(ffStream_t stream, - LinearPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *filter_ptr, - float const *bias_ptr, - int in_dim, - int out_dim, - int batch_size); - -void backward_kernel(ffStream_t stream, - LinearPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - 
float const *kernel_ptr, - float *kernel_grad_ptr, - float *bias_grad_ptr, - int in_dim, - int out_dim, - int batch_size); - -} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/linear_kernels_cpu.h b/lib/kernels/include/kernels/linear_kernels_cpu.h new file mode 100644 index 0000000000..4621f38d7f --- /dev/null +++ b/lib/kernels/include/kernels/linear_kernels_cpu.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "op-attrs/ops/linear_attrs.dtg.h" +#include + +namespace FlexFlow { + +void linear_cpu_forward_kernel( + LinearAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &projection, + std::optional const &bias); + +void linear_cpu_backward_kernel( + LinearAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &projection, + GenericTensorAccessorW const &projection_grad, + std::optional const &bias_grad); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels_gpu.h b/lib/kernels/include/kernels/linear_kernels_gpu.h new file mode 100644 index 0000000000..02fac75c25 --- /dev/null +++ b/lib/kernels/include/kernels/linear_kernels_gpu.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/linear_per_device_state.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::Linear { + +LinearPerDeviceState + gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int output_num_channels); + +void gpu_forward_kernel(ffStream_t stream, + LinearPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); + +void gpu_backward_kernel(ffStream_t stream, + LinearPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, + int in_dim, + int out_dim, + int batch_size); + +void gpu_cleanup_kernel(LinearPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Linear + +#endif diff --git a/lib/kernels/include/kernels/linear_per_device_state.struct.toml b/lib/kernels/include/kernels/linear_per_device_state.struct.toml new file mode 100644 index 0000000000..3ed534a23f --- /dev/null +++ b/lib/kernels/include/kernels/linear_per_device_state.struct.toml @@ -0,0 +1,56 @@ +namespace = "FlexFlow" +name = "LinearPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", + "", + "op-attrs/activation.dtg.h", + "op-attrs/regularizer_attrs.dtg.h", + "op-attrs/datatype.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "actiDesc" +type = 
"ffActivationDescriptor_t" + +[[fields]] +name = "one_ptr" +type = "float const *" + +[[fields]] +name = "activation_mode" +type = "cudnnActivationMode_t" + +[[fields]] +name = "activation" +type = "std::optional<::FlexFlow::Activation>" + +[[fields]] +name = "regularizer" +type = "std::optional<::FlexFlow::RegularizerAttrs>" + +[[fields]] +name = "use_bias" +type = "bool" + +[[fields]] +name = "input_type" +type = "::FlexFlow::DataType" + +[[fields]] +name = "weight_type" +type = "::FlexFlow::DataType" + +[[fields]] +name = "output_type" +type = "::FlexFlow::DataType" diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..092d3691f5 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,12 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow { void sparse_categorical_crossentropy_loss_backward_kernel( - ffStream_t stream, + device_stream_t const &stream, float *logit_grad_ptr, float const *logit_ptr, int const *label_ptr, @@ -16,21 +17,23 @@ void sparse_categorical_crossentropy_loss_backward_kernel( int num_classes, int k, float scale_factor); -void categorical_crossentropy_loss_backward_kernel(ffStream_t stream, - float *logit_grad_ptr, - float const *logit_ptr, - float const *label_ptr, - size_t logit_volume, - size_t logit_grad_volume, - float scale_factor); -void mean_squared_error_avg_loss_backward_kernel(ffStream_t stream, + +void categorical_crossentropy_loss_backward_kernel( + device_stream_t const &stream, + GenericTensorAccessorW const &logit_grad, + GenericTensorAccessorR const &logit, + GenericTensorAccessorR const &label, + float scale_factor); + +void mean_squared_error_avg_loss_backward_kernel(device_stream_t const &stream, float *logit_grad_ptr, float const *logit_ptr, float const *label_ptr, size_t logit_volume, size_t logit_grad_volume, float scale_factor); -void identity_loss_backward_kernel(ffStream_t stream, + +void identity_loss_backward_kernel(device_stream_t const &stream, float *loss_grad_ptr, float const *loss_ptr, size_t loss_volume, diff --git a/lib/kernels/include/kernels/loss_function_kernels_cpu.h b/lib/kernels/include/kernels/loss_function_kernels_cpu.h new file mode 100644 index 0000000000..b6abd01ab3 --- /dev/null +++ b/lib/kernels/include/kernels/loss_function_kernels_cpu.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include + +namespace FlexFlow { + +void sparse_categorical_crossentropy_loss_backward_cpu_kernel( + float *logit_grad_ptr, + float const *logit_ptr, + int const *label_ptr, + size_t logit_volume, + size_t logit_grad_volume, + int num_samples, + int num_classes, + int k, + float scale_factor); + +void categorical_crossentropy_loss_backward_cpu_kernel( + GenericTensorAccessorW const &logit_grad_ptr, + GenericTensorAccessorR const &logit_ptr, + GenericTensorAccessorR const &label_ptr, + float scale_factor); + +void mean_squared_error_avg_loss_backward_cpu_kernel(float *logit_grad_ptr, + float const *logit_ptr, + float const *label_ptr, + size_t logit_volume, + size_t logit_grad_volume, + float scale_factor); + +void 
identity_loss_backward_cpu_kernel(float *loss_grad_ptr,
+                                       float const *loss_ptr,
+                                       size_t loss_volume,
+                                       size_t loss_grad_volume,
+                                       float scale_factor);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/loss_function_kernels_gpu.h b/lib/kernels/include/kernels/loss_function_kernels_gpu.h
new file mode 100644
index 0000000000..7bda92531f
--- /dev/null
+++ b/lib/kernels/include/kernels/loss_function_kernels_gpu.h
@@ -0,0 +1,45 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_GPU_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_GPU_H
+
+#include "kernels/device.h"
+
+namespace FlexFlow {
+
+void sparse_categorical_crossentropy_loss_backward_gpu_kernel(
+    ffStream_t stream,
+    float *logit_grad_ptr,
+    float const *logit_ptr,
+    int const *label_ptr,
+    size_t logit_volume,
+    size_t logit_grad_volume,
+    int num_samples,
+    int num_classes,
+    int k,
+    float scale_factor);
+
+void categorical_crossentropy_loss_backward_gpu_kernel(ffStream_t stream,
+                                                       float *logit_grad_ptr,
+                                                       float const *logit_ptr,
+                                                       float const *label_ptr,
+                                                       size_t logit_volume,
+                                                       size_t logit_grad_volume,
+                                                       float scale_factor);
+
+void mean_squared_error_avg_loss_backward_gpu_kernel(ffStream_t stream,
+                                                     float *logit_grad_ptr,
+                                                     float const *logit_ptr,
+                                                     float const *label_ptr,
+                                                     size_t logit_volume,
+                                                     size_t logit_grad_volume,
+                                                     float scale_factor);
+
+void identity_loss_backward_gpu_kernel(ffStream_t stream,
+                                       float *loss_grad_ptr,
+                                       float const *loss_ptr,
+                                       size_t loss_volume,
+                                       size_t loss_grad_volume,
+                                       float scale_factor);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
index 287369a202..e80e3e4b15 100644
--- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h
+++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_KERNELS_MANAGED_HANDLE_H
 
 #include "kernels/ff_handle.h"
+#include "pcg/device_type.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
@@ -33,6 +35,11 @@ struct ManagedPerDeviceFFHandle {
   PerDeviceFFHandle *handle;
 };
 
+std::optional<ManagedPerDeviceFFHandle>
+    create_local_handle_for_device_type(DeviceType device_type,
+                                        size_t workSpaceSize,
+                                        bool allowTensorOpMathConversion);
+
 ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize,
                                                       bool allowTensorOpMathConversion);
 
diff --git a/lib/kernels/include/kernels/map_tensor_accessors.h b/lib/kernels/include/kernels/map_tensor_accessors.h
index 2933a611cf..f7aa6a1001 100644
--- a/lib/kernels/include/kernels/map_tensor_accessors.h
+++ b/lib/kernels/include/kernels/map_tensor_accessors.h
@@ -15,57 +15,78 @@ namespace FlexFlow {
 
 template 
 struct CPUMapTensorAccessorInPlace {
   template 
-  void operator()(GenericTensorAccessorW &accessor, F &&f) {
+  void operator()(GenericTensorAccessorW const &accessor, F &&f) {
     ASSERT(accessor.device_type == DeviceType::CPU);
 
-    for (ArrayCoord const &coord : get_array_coord_set(accessor.shape)) {
-      accessor.at
(coord.ff_ordered) = f(accessor.at
(coord.ff_ordered)); + for (TensorDimsCoord const &coord : + get_tensor_dims_coord_set(accessor.shape.dims)) { + accessor.at
(coord) = f(accessor.at
(coord)); } } }; template -void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, F &&f) { +void map_tensor_accessor_inplace(GenericTensorAccessorW const &accessor, + F &&f) { ASSERT(accessor.device_type == DeviceType::CPU); DataTypeDispatch1{}( - accessor.data_type, accessor, f); + accessor.shape.data_type, accessor, f); } template struct CPUMapTensorAccessor { template void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, + GenericTensorAccessorW const &output, F &&f) { - ArrayShape shape = require_same(input.shape, output.shape); + TensorDims tensor_dims = require_same(input.shape.dims, output.shape.dims); ASSERT(input.device_type == DeviceType::CPU); ASSERT(output.device_type == DeviceType::CPU); - for (ArrayCoord const &coord : get_array_coord_set(shape)) { + for (TensorDimsCoord const &coord : + get_tensor_dims_coord_set(tensor_dims)) { output.at< type_to_data_type_enum_v>>>( - coord.ff_ordered) = f(input.at
(coord.ff_ordered)); + coord) = f(input.at
(coord)); } } }; template > -GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, - F &&f, - Allocator &output_allocator) { +void map_tensor_accessor_to(GenericTensorAccessorR const &input, + F &&f, + GenericTensorAccessorW const &output) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( - get_tensor_shape(input.shape, type_to_data_type_enum_v)); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output.shape); DataTypeDispatch1{}( - input.data_type, input_cpu, output_cpu, f); + input.shape.data_type, input_cpu, output_cpu, f); - return copy_tensor_accessor_w(output_cpu, output_allocator); + copy_accessor_data_to_l_from_r( + output, read_only_accessor_from_write_accessor(output_cpu)); +} + +template > +GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, + F &&f, + Allocator &output_allocator) { + TensorShape output_shape = TensorShape{ + /*dims=*/input.shape.dims, + /*data_type=*/type_to_data_type_enum_v, + }; + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + map_tensor_accessor_to(input, f, output); + + return output; } template @@ -78,30 +99,30 @@ struct CPUMapTensorAccessors2 { GenericTensorAccessorW &output, F &&f) { - ArrayShape shape = throw_if_unexpected(require_all_same1(std::vector{ - lhs.shape, - rhs.shape, - output.shape, + TensorDims dims = throw_if_unexpected(require_all_same1(std::vector{ + lhs.shape.dims, + rhs.shape.dims, + output.shape.dims, })); ASSERT(lhs.device_type == DeviceType::CPU); ASSERT(rhs.device_type == DeviceType::CPU); ASSERT(output.device_type == DeviceType::CPU); - for (ArrayCoord const &coord : get_array_coord_set(shape)) { - output.at>(coord.ff_ordered) = - f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); + for (TensorDimsCoord const &coord : get_tensor_dims_coord_set(dims)) { + output.at>(coord) = + f(lhs.at(coord), rhs.at(coord)); } } }; template -GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - DataType output_data_type, - F &&f, - Allocator &output_allocator) { - ArrayShape shape = require_same(lhs.shape, rhs.shape); +void map_tensor_accessors2_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + DataType output_data_type, + F &&f, + GenericTensorAccessorW const &output) { + TensorDims output_dims = require_same(lhs.shape.dims, rhs.shape.dims); Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR lhs_cpu = @@ -109,12 +130,32 @@ GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); GenericTensorAccessorW output_cpu = - cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); + cpu_allocator.allocate_tensor(TensorShape{output_dims, output_data_type}); + + DataTypeDispatch2{}(lhs.shape.data_type, + rhs.shape.data_type, + lhs_cpu, + rhs_cpu, + output_cpu, + f); + + return copy_accessor_data_to_l_from_r(output, output_cpu); +} + +template +GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + DataType output_data_type, + F &&f, + Allocator &output_allocator) { + TensorDims output_dims = require_same(lhs.shape.dims, rhs.shape.dims); 
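+  // Allocate the output on the target device, then delegate to
+  // map_tensor_accessors2_to above, which stages lhs/rhs through CPU memory,
+  // applies f element-wise, and copies the result back into `output`.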
+ + GenericTensorAccessorW output = output_allocator.allocate_tensor( + TensorShape{output_dims, output_data_type}); - DataTypeDispatch2{}( - lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); + map_tensor_accessors2_to(lhs, rhs, output_data_type, f, output); - return copy_tensor_accessor_w(output_cpu, output_allocator); + return output; } } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/mha_per_device_state.struct.toml b/lib/kernels/include/kernels/mha_per_device_state.struct.toml new file mode 100644 index 0000000000..324e8d1184 --- /dev/null +++ b/lib/kernels/include/kernels/mha_per_device_state.struct.toml @@ -0,0 +1,65 @@ +namespace = "FlexFlow" +name = "MHAPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", + "kernels/allocation.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "weightSize" +type = "size_t" + +[[fields]] +name = "reserveSpaceSize" +type = "size_t" + +[[fields]] +name = "attnDesc" +type = "ffAttnDescriptor_t" + +[[fields]] +name = "qDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "kDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "vDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "oDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "devQoSeqArray" +type = "int *" + +[[fields]] +name = "devKvSeqArray" +type = "int *" + +[[fields]] +name = "loWinIdx" +type = "int *" + +[[fields]] +name = "hiWinIdx" +type = "int *" + +[[fields]] +name = "reserveSpace" +type = "void *" + +[[fields]] +name = "allocator" +type = "::FlexFlow::Allocator" diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 51e6f8640f..6bb7b913be 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,62 +1,37 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *sgd_v_ptr); - -#ifdef FF_USE_NCCL -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); -#endif - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *adam_v_ptr, - float *adam_m_ptr); - -#ifdef FF_USE_NCCL -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *adam_v_ptr, - float *adam_m_ptr); -#endif +void sgd_update_task(device_stream_t const &stream, + device_handle_t const &handle, + float lr, + float momentum, + bool nesterov, + float weight_decay, + GenericTensorAccessorR const &weight_grad, + int num_replicas, + GenericTensorAccessorW const &weight, + std::optional const 
&sgd_v); + +void adam_update_task(device_stream_t const &stream, + device_handle_t const &handle, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/optimizer_kernels_cpu.h b/lib/kernels/include/kernels/optimizer_kernels_cpu.h new file mode 100644 index 0000000000..1a7943f9ca --- /dev/null +++ b/lib/kernels/include/kernels/optimizer_kernels_cpu.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include + +namespace FlexFlow { + +void cpu_sgd_update_task(float lr, + float momentum, + bool nesterov, + float weight_decay, + GenericTensorAccessorR const &weight_grad, + GenericTensorAccessorW const &weight, + std::optional const &sgd_v); + +void cpu_adam_update_task(float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/optimizer_kernels_gpu.h b/lib/kernels/include/kernels/optimizer_kernels_gpu.h new file mode 100644 index 0000000000..3e2a65a638 --- /dev/null +++ b/lib/kernels/include/kernels/optimizer_kernels_gpu.h @@ -0,0 +1,59 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" + +namespace FlexFlow { + +void gpu_sgd_ps_update_task(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *sgd_v_ptr); + +void gpu_sgd_nccl_update_task(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const &, + float const *weight_grad_ptr, + size_t size, + float *weight_ptr, + float *sgd_v_ptr); + +void gpu_adam_ps_update_task(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); + +void gpu_adam_nccl_update_task(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *weight_grad_ptr, + size_t size, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h deleted file mode 100644 index aa3a7a1ef7..0000000000 --- a/lib/kernels/include/kernels/partition_kernels.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow { - -struct RepartitionPerDeviceState { - PerDeviceFFHandle handle; - req data_type; -}; - -FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); - -namespace Kernels::Repartition { - -RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - 
DataType data_type); - -void forward_kernel(ffStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); - -} // namespace Kernels::Repartition -} // namespace FlexFlow - -#endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_per_device_state.struct.toml b/lib/kernels/include/kernels/partition_per_device_state.struct.toml new file mode 100644 index 0000000000..a008e422cd --- /dev/null +++ b/lib/kernels/include/kernels/partition_per_device_state.struct.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "RepartitionPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "op-attrs/datatype.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "data_type" +type = "::FlexFlow::DataType" diff --git a/lib/kernels/include/kernels/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml deleted file mode 100644 index 0171e3e497..0000000000 --- a/lib/kernels/include/kernels/per_device_op_state.variant.toml +++ /dev/null @@ -1,82 +0,0 @@ -namespace = "FlexFlow" -name = "PerDeviceOpState" -features = [] - -includes = [ - "kernels/attention_kernels.h", - "kernels/batch_norm_kernels.h", - "kernels/conv_2d_kernels.h", - "kernels/dropout_kernels.h", - "kernels/element_binary_kernels.h", - "kernels/element_unary_kernels.h", - "kernels/gather_kernels.h", - "kernels/layer_norm_kernels.h", - "kernels/linear_kernels.h", - "kernels/partition_kernels.h", - "kernels/pool_2d_kernels.h", - "kernels/reduce_kernels.h", - "kernels/reduction_kernels.h", - "kernels/reshape_kernels.h", - "kernels/softmax_kernels.h", - "kernels/topk_kernels.h", -] - -[[values]] -type = "::FlexFlow::MHAPerDeviceState" -key = "mha_per_device_state" - -[[values]] -type = "::FlexFlow::BatchNormPerDeviceState" -key = "batch_norm_per_device_state" - -[[values]] -type = "::FlexFlow::Conv2DPerDeviceState" -key = "conv2d_per_device_state" - -[[values]] -type = "::FlexFlow::DropoutPerDeviceState" -key = "dropout_per_device_state" - -[[values]] -type = "::FlexFlow::ElementBinaryPerDeviceState" -key = "element_binary_per_device_state" - -[[values]] -type = "::FlexFlow::ElementUnaryPerDeviceState" -key = "element_unary_per_device_state" - -[[values]] -type = "::FlexFlow::GatherPerDeviceState" -key = "gather_per_device_state" - -[[values]] -type = "::FlexFlow::LayerNormPerDeviceState" -key = "layer_norm_per_device_state" - -[[values]] -type = "::FlexFlow::LinearPerDeviceState" -key = "linear_per_device_state" - -[[values]] -type = "::FlexFlow::Pool2DPerDeviceState" -key = "pool_2d_per_device_state" - -[[values]] -type = "::FlexFlow::ReducePerDeviceState" -key = "reduce_per_device_state" - -[[values]] -type = "::FlexFlow::RepartitionPerDeviceState" -key = "repartition_per_device_state" - -[[values]] -type = "::FlexFlow::ReshapePerDeviceState" -key = "reshape_per_device_state" - -[[values]] -type = "::FlexFlow::SoftmaxPerDeviceState" -key = "softmax_per_device_state" - -[[values]] -type = "::FlexFlow::TopKPerDeviceState" -key = "topk_per_device_state" diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 76aa07d0a4..c18ff92289 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ 
b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,80 +1,52 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" +#include "kernels/pool_2d_per_device_state.dtg.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" -#include "utils/visitable.h" - -namespace FlexFlow { - -struct Pool2DPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor, outputTensor; - ffActivationDescriptor_t actiDesc; - ffPoolingDescriptor_t poolDesc; - bool relu; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, - handle, - inputTensor, - outputTensor, - actiDesc, - poolDesc, - relu); - -namespace Kernels::Pool2D { - -Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int pad_h, - int pad_w, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - PoolOp pool_type); - -void init_kernel(Pool2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int pad_h, - int pad_w, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - PoolOp pool_type); - -void forward_kernel(ffStream_t stream, - Pool2DPerDeviceState const &m, +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::Pool2D { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + int input_w, + int input_h, + int input_c, + int input_n, + int output_w, + int output_h, + int output_c, + int output_n, + int pad_h, + int pad_w, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + PoolOp pool_type); + +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, - Pool2DPerDeviceState const &m, - void const *output_ptr, - void const *output_grad_ptr, - void const *input_ptr, - void *input_grad_ptr); +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state); -} // namespace Kernels::Pool2D -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Pool2D #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/pool_2d_kernels_cpu.h b/lib/kernels/include/kernels/pool_2d_kernels_cpu.h new file mode 100644 index 0000000000..aa13e913b6 --- /dev/null +++ b/lib/kernels/include/kernels/pool_2d_kernels_cpu.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Pool2D { + +void cpu_forward_kernel(void const *input_ptr, void *output_ptr); + +void cpu_backward_kernel(void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Pool2D + +#endif diff --git a/lib/kernels/include/kernels/pool_2d_kernels_gpu.h b/lib/kernels/include/kernels/pool_2d_kernels_gpu.h new file mode 100644 index 
0000000000..8a1499e97e --- /dev/null +++ b/lib/kernels/include/kernels/pool_2d_kernels_gpu.h @@ -0,0 +1,46 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/pool_2d_per_device_state.dtg.h" +#include "op-attrs/activation.dtg.h" +#include "op-attrs/ops/pool_2d.h" + +namespace FlexFlow::Kernels::Pool2D { + +Pool2DPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + int input_w, + int input_h, + int input_c, + int input_n, + int output_w, + int output_h, + int output_c, + int output_n, + int pad_h, + int pad_w, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + PoolOp pool_type); + +void gpu_forward_kernel(ffStream_t stream, + Pool2DPerDeviceState const &per_device_state, + void const *input_ptr, + void *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + Pool2DPerDeviceState const &per_device_state, + void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + +void gpu_cleanup_kernel(Pool2DPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Pool2D + +#endif diff --git a/lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml b/lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml new file mode 100644 index 0000000000..63e98cca85 --- /dev/null +++ b/lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "Pool2DPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "actiDesc" +type = "ffActivationDescriptor_t" + +[[fields]] +name = "poolDesc" +type = "ffPoolingDescriptor_t" + +[[fields]] +name = "relu" +type = "bool" diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 7c4145c426..c0a0e794e3 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -2,21 +2,26 @@ #define _FLEXFLOW_KERNELS_PROFILING_H #include "kernels/device.h" +#include "kernels/device_stream_t.h" #include "kernels/profiling_settings.dtg.h" -#include "utils/visitable.h" +#include "pcg/device_type.dtg.h" +#include namespace FlexFlow { template -std::optional - profiling_wrapper(F const &f, bool enable_profiling, Ts &&...ts) { +std::optional profiling_wrapper(F const &f, + bool enable_profiling, + DeviceType device_type, + Ts &&...ts) { if (enable_profiling) { - ProfilingSettings settings = {0, 1}; + ProfilingSettings settings = ProfilingSettings{ + /*warmup_iters=*/0, + /*measure_iters=*/1, + }; return profiling_wrapper(f, settings, std::forward(ts)...); } else { - ffStream_t stream; - checkCUDA(get_legion_stream(&stream)); - f(stream, std::forward(ts)...); + f(get_stream_for_device_type(device_type), std::forward(ts)...); return std::nullopt; } } @@ -24,9 +29,54 @@ std::optional template std::optional profiling_wrapper(F const &f, ProfilingSettings const &settings, + DeviceType device_type, Ts &&...ts) { - ffStream_t stream; - checkCUDA(get_legion_stream(&stream)); + if (settings.measure_iters <= 0) { + return std::nullopt; + } + + if (device_type == DeviceType::GPU) { + return 
gpu_profiling_wrapper(f, settings, std::forward(ts)...); + } else { + ASSERT(device_type == DeviceType::CPU); + return cpu_profiling_wrapper(f, settings, std::forward(ts)...); + } +} + +template +float cpu_profiling_wrapper(F const &f, + ProfilingSettings const &settings, + Ts &&...ts) { + ASSERT(settings.measure_iters > 0); + + device_stream_t stream = get_cpu_device_stream(); + + using TimePoint = std::chrono::time_point; + + std::optional start = std::nullopt; + std::optional end = std::nullopt; + + for (int i = 0; i < settings.warmup_iters + settings.measure_iters; i++) { + if (i == settings.warmup_iters) { + start = std::chrono::steady_clock::now(); + } + f(stream, std::forward(ts)...); + } + end = std::chrono::steady_clock::now(); + + std::chrono::duration avg_duration = + (end.value() - start.value()) / settings.measure_iters; + + return avg_duration.count(); +} + +template +float gpu_profiling_wrapper(F const &f, + ProfilingSettings const &settings, + Ts &&...ts) { + ASSERT(settings.measure_iters > 0); + + device_stream_t stream = get_gpu_device_stream(); ffEvent_t t_start, t_end; checkCUDA(ffEventCreate(&t_start)); @@ -34,18 +84,18 @@ std::optional profiling_wrapper(F const &f, for (int i = 0; i < settings.warmup_iters + settings.measure_iters; i++) { if (i == settings.warmup_iters) { - checkCUDA(ffEventRecord(t_start, stream)); + checkCUDA(ffEventRecord(t_start, stream.require_gpu())); } f(stream, std::forward(ts)...); } float elapsed = 0; - checkCUDA(ffEventRecord(t_end, stream)); + checkCUDA(ffEventRecord(t_end, stream.require_gpu())); checkCUDA(ffEventSynchronize(t_end)); checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end)); checkCUDA(ffEventDestroy(t_start)); checkCUDA(ffEventDestroy(t_end)); - return elapsed; + return elapsed / settings.measure_iters; } } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 10e8e4393b..c890ab35c3 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,48 +1,35 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "kernels/array_shape.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" +#include "kernels/reduce_per_device_state.dtg.h" #include "op-attrs/operator_type.dtg.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "pcg/device_type.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::Reduce { -struct ReducePerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffReduceTensorDescriptor_t reduceDesc; - OperatorType op_type; - req reduction_size; -}; +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + OperatorType const &operator_type, + size_t const &reduction_size, + TensorShape const &input_shape, + TensorShape const &output_shape); -FF_VISITABLE_STRUCT(ReducePerDeviceState, - handle, - inputTensor, - outputTensor, - reduceDesc, - op_type, - reduction_size); - -namespace Kernels::Reduce { - -ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, - OperatorType const &, - size_t const &, - ArrayShape const &input_shape, - ArrayShape const &output_shape); - -void forward_kernel(ffStream_t stream, - ReducePerDeviceState const &m, +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, float const 
*input_ptr, float *output_ptr); -void backward_kernel(ffStream_t stream, - ReducePerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr); -} // namespace Kernels::Reduce -} // namespace FlexFlow +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Reduce #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduce_kernels_cpu.h b/lib/kernels/include/kernels/reduce_kernels_cpu.h new file mode 100644 index 0000000000..9b625f9304 --- /dev/null +++ b/lib/kernels/include/kernels/reduce_kernels_cpu.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Reduce { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr); + +void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Reduce + +#endif diff --git a/lib/kernels/include/kernels/reduce_kernels_gpu.h b/lib/kernels/include/kernels/reduce_kernels_gpu.h new file mode 100644 index 0000000000..c0c06fe78b --- /dev/null +++ b/lib/kernels/include/kernels/reduce_kernels_gpu.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/reduce_per_device_state.dtg.h" +#include "op-attrs/operator_type.dtg.h" +#include "op-attrs/tensor_shape.dtg.h" + +namespace FlexFlow::Kernels::Reduce { + +ReducePerDeviceState gpu_init_kernel(PerDeviceFFHandle const &, + OperatorType const &, + size_t const &, + TensorShape const &input_shape, + TensorShape const &output_shape); + +void gpu_forward_kernel(ffStream_t stream, + ReducePerDeviceState const &m, + float const *input_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + ReducePerDeviceState const &m, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Reduce + +#endif diff --git a/lib/kernels/include/kernels/reduce_per_device_state.struct.toml b/lib/kernels/include/kernels/reduce_per_device_state.struct.toml new file mode 100644 index 0000000000..e82099ad25 --- /dev/null +++ b/lib/kernels/include/kernels/reduce_per_device_state.struct.toml @@ -0,0 +1,33 @@ +namespace = "FlexFlow" +name = "ReducePerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", + "op-attrs/operator_type.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "reduceDesc" +type = "ffReduceTensorDescriptor_t" + +[[fields]] +name = "op_type" +type = "::FlexFlow::OperatorType" + +[[fields]] +name = "reduction_size" +type = "size_t" diff --git a/lib/kernels/include/kernels/reduce_tensor_accessor.h b/lib/kernels/include/kernels/reduce_tensor_accessor.h index d803c7ef9b..a06afbf5f6 100644 --- a/lib/kernels/include/kernels/reduce_tensor_accessor.h +++ b/lib/kernels/include/kernels/reduce_tensor_accessor.h @@ -3,10 +3,11 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" -#include "kernels/array_coord.h" #include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" 
#include "kernels/local_cpu_allocator.h" +#include "op-attrs/tensor_dims_coord.h" +#include "op-attrs/tensor_shape.h" #include "utils/containers/contains.h" #include "utils/containers/foldl1.h" #include "utils/containers/foldr1.h" @@ -32,18 +33,18 @@ struct CPUReduceTensorAccessorInDims { return contains(dims_to_reduce, dim); }; - std::unordered_map> + std::unordered_map> output_coord_from_input_coord = group_by( - get_array_coord_set(input.shape), - [&](ArrayCoord const &input_coord) { - return array_coord_drop_dims(input_coord, should_drop_dim); + get_tensor_dims_coord_set(input.shape.dims), + [&](TensorDimsCoord const &input_coord) { + return tensor_dims_coord_drop_dims(input_coord, should_drop_dim); }); for (auto const &[output_coord, input_coords] : output_coord_from_input_coord) { std::vector input_values = transform( - sorted(input_coords), [&](ArrayCoord const &input_coord) -> T { - return input.at
(input_coord.ff_ordered); + sorted(input_coords), [&](TensorDimsCoord const &input_coord) -> T { + return input.at
(input_coord); }); T result = foldl1(input_values, f); @@ -51,7 +52,7 @@ struct CPUReduceTensorAccessorInDims { return f(elem, accum); })); - output.at
(output_coord.ff_ordered) = result; + output.at
(output_coord) = result; } } }; @@ -71,13 +72,13 @@ GenericTensorAccessorW return contains(dims, dim); }; - ArrayShape reduced_shape = - array_shape_drop_dims(input.shape, should_drop_dim); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( - get_tensor_shape(reduced_shape, input.data_type)); + TensorShape reduced_shape = + tensor_shape_drop_dims(input.shape, should_drop_dim); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(reduced_shape); DataTypeDispatch1{}( - input_cpu.data_type, input_cpu, output_cpu, dims, f); + input_cpu.shape.data_type, input_cpu, output_cpu, dims, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } @@ -88,7 +89,7 @@ real_type_t
F &&f) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - std::unordered_set input_dims = get_ff_dim_t_set(input.shape); + std::unordered_set input_dims = get_ff_dim_t_set(input.shape.dims); GenericTensorAccessorW reduced = reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h deleted file mode 100644 index 08f73cd9ab..0000000000 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Reduction { - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - size_t num_replicas); - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input); - -} // namespace FlexFlow::Kernels::Reduction - -#endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h deleted file mode 100644 index 0b113868ee..0000000000 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Replicate { - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input, - size_t num_replicas); - -} // namespace FlexFlow::Kernels::Replicate - -#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h deleted file mode 100644 index 2a2eaa5eb6..0000000000 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H -#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Replicate { - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output); - -void cpu_backward_kernel(GenericTensorAccessorR const &output, - GenericTensorAccessorW &input, - size_t num_replicas); - -} // namespace FlexFlow::Kernels::Replicate - -#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 88c11d2fb0..310b349473 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -2,32 +2,18 @@ #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" -#include "utils/required_core.h" +#include "kernels/device_stream_t.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::Reshape { -struct ReshapePerDeviceState { - req data_type; -}; - -FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); - -namespace Kernels::Reshape { - -ReshapePerDeviceState init_kernel(DataType data_type); - -void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const 
&per_device_state, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const &per_device_state, +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input); -} // namespace Kernels::Reshape -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reshape #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reshape_kernels_cpu.h b/lib/kernels/include/kernels/reshape_kernels_cpu.h new file mode 100644 index 0000000000..a81ea639f6 --- /dev/null +++ b/lib/kernels/include/kernels/reshape_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Reshape { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Reshape + +#endif diff --git a/lib/kernels/include/kernels/reshape_kernels_gpu.h b/lib/kernels/include/kernels/reshape_kernels_gpu.h new file mode 100644 index 0000000000..1454ce56ee --- /dev/null +++ b/lib/kernels/include/kernels/reshape_kernels_gpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Reshape { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Reshape + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 768707175c..bfef26798c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,17 +1,18 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "kernels/device.h" -#include "kernels/reverse_kernels_cpu.h" +#include "kernels/accessor.h" +#include "kernels/device_stream_t.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" namespace FlexFlow::Kernels::Reverse { -void forward_kernel(ffStream_t stream, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input_accessor, GenericTensorAccessorW &output_accessor, ReverseAttrs const &); -void backward_kernel(ffStream_t stream, +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output_accessor, GenericTensorAccessorW &input_accessor, ReverseAttrs const &); diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index ec82000f8f..582b167d67 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H #include "kernels/accessor.h" -#include "kernels/device.h" #include "op-attrs/ops/reverse_attrs.dtg.h" namespace FlexFlow::Kernels::Reverse { diff --git 
a/lib/kernels/include/kernels/reverse_kernels_gpu.h b/lib/kernels/include/kernels/reverse_kernels_gpu.h new file mode 100644 index 0000000000..32f256392f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_gpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h index 766d70b915..a2611f5aef 100644 --- a/lib/kernels/include/kernels/reverse_kernels_params.h +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -1,14 +1,14 @@ #ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H #define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H -#include "kernels/array_shape.h" #include "kernels/reverse_kernels_params.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" +#include "op-attrs/tensor_dims.dtg.h" namespace FlexFlow { ReverseKernelsParams - compute_reverse_kernels_params(ArrayShape const &output_shape, + compute_reverse_kernels_params(TensorDims const &output_dims, ReverseAttrs const &attrs); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 60101578e3..23f0ff879d 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,40 +1,36 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "ff_handle.h" -#include "kernels/device.h" - -namespace FlexFlow { - -// Note(lambda): SoftmaxPerDeviceState may need add more elements -struct SoftmaxPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - req dim; -}; - -FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); - -namespace Kernels::Softmax { - -SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - int dim, - int input_n, - int input_c, - int input_h, - int input_w); - -void forward_kernel(ffStream_t stream, - SoftmaxPerDeviceState const &m, - float const *input_ptr, - float *output_ptr); - -void backward_kernel(ffStream_t stream, +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/ff_handle.h" +#include "kernels/softmax_per_device_state.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::Softmax { + +std::optional init_kernel(DeviceType device_type, + device_handle_t const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void backward_kernel(device_stream_t const &stream, float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); -} // namespace Kernels::Softmax -} // namespace FlexFlow +void cleanup_kernel(DeviceType device_type, + 
std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::Softmax #endif diff --git a/lib/kernels/include/kernels/softmax_kernels_cpu.h b/lib/kernels/include/kernels/softmax_kernels_cpu.h new file mode 100644 index 0000000000..536a28e62c --- /dev/null +++ b/lib/kernels/include/kernels/softmax_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_CPU_H + +#include + +namespace FlexFlow::Kernels::Softmax { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr); + +void cpu_backward_kernel(float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements); + +} // namespace FlexFlow::Kernels::Softmax + +#endif diff --git a/lib/kernels/include/kernels/softmax_kernels_gpu.h b/lib/kernels/include/kernels/softmax_kernels_gpu.h new file mode 100644 index 0000000000..16e98857f4 --- /dev/null +++ b/lib/kernels/include/kernels/softmax_kernels_gpu.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/softmax_per_device_state.dtg.h" +#include "op-attrs/ff_dim_t.dtg.h" + +namespace FlexFlow::Kernels::Softmax { + +SoftmaxPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w); + +void gpu_forward_kernel(ffStream_t stream, + SoftmaxPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements); + +void gpu_cleanup_kernel(SoftmaxPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Softmax + +#endif diff --git a/lib/kernels/include/kernels/softmax_per_device_state.struct.toml b/lib/kernels/include/kernels/softmax_per_device_state.struct.toml new file mode 100644 index 0000000000..374dd28c63 --- /dev/null +++ b/lib/kernels/include/kernels/softmax_per_device_state.struct.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "SoftmaxPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", + "op-attrs/ff_dim_t.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "dim" +type = "::FlexFlow::ff_dim_t" diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 3b580f94be..6c3d576f29 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,22 +1,24 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Split { -void forward_kernel(ffStream_t stream, + +void forward_kernel(device_stream_t const &stream, float **out_ptrs, float const *in_ptr, - coord_t const *out_blk_sizes, - coord_t in_blk_size, - coord_t num_blks, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, int numOutputs); -void backward_kernel(ffStream_t stream, + +void backward_kernel(device_stream_t const &stream, float *in_grad_ptr, float const **out_grad_ptr, - coord_t const *out_blk_sizes, - coord_t in_blk_size, - coord_t num_blks, + int const *out_blk_sizes, + int 
in_blk_size, + int num_blks, int numOutputs); } // namespace FlexFlow::Kernels::Split diff --git a/lib/kernels/include/kernels/split_kernels_cpu.h b/lib/kernels/include/kernels/split_kernels_cpu.h new file mode 100644 index 0000000000..7f50804dff --- /dev/null +++ b/lib/kernels/include/kernels/split_kernels_cpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Split { + +void cpu_forward_kernel(float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +void cpu_backward_kernel(float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +} // namespace FlexFlow::Kernels::Split + +#endif diff --git a/lib/kernels/include/kernels/split_kernels_gpu.h b/lib/kernels/include/kernels/split_kernels_gpu.h new file mode 100644 index 0000000000..e6bfc5454c --- /dev/null +++ b/lib/kernels/include/kernels/split_kernels_gpu.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_GPU_H + +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Split { + +void gpu_forward_kernel(ffStream_t stream, + float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +void gpu_backward_kernel(ffStream_t stream, + float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +} // namespace FlexFlow::Kernels::Split + +#endif diff --git a/lib/kernels/include/kernels/tensor_accessor_binary_ops.h b/lib/kernels/include/kernels/tensor_accessor_binary_ops.h new file mode 100644 index 0000000000..dde51b3266 --- /dev/null +++ b/lib/kernels/include/kernels/tensor_accessor_binary_ops.h @@ -0,0 +1,48 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_BINARY_OPS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_BINARY_OPS_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_elementwise_add(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_elementwise_add_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_elementwise_subtract(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_elementwise_subtract_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_elementwise_multiply(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_elementwise_multiply_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW tensor_accessor_matmul(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_matmul_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +} // namespace FlexFlow + 
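+// Usage sketch: each binary op above comes in a value-returning form that
+// allocates its result through the given Allocator, and a *_to form that
+// writes into a caller-provided accessor. Assuming two read accessors
+// lhs/rhs and the create_local_cpu_memory_allocator() factory used
+// elsewhere in this series:
+//
+//   Allocator alloc = create_local_cpu_memory_allocator();
+//   GenericTensorAccessorW sum =
+//       tensor_accessor_elementwise_add(lhs, rhs, alloc);  // allocates
+//   tensor_accessor_elementwise_add_to(lhs, rhs, sum);     // writes in place
+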
+#endif diff --git a/lib/kernels/include/kernels/tensor_accessor_unary_ops.h b/lib/kernels/include/kernels/tensor_accessor_unary_ops.h new file mode 100644 index 0000000000..b7bb561e4a --- /dev/null +++ b/lib/kernels/include/kernels/tensor_accessor_unary_ops.h @@ -0,0 +1,50 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_UNARY_OPS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_UNARY_OPS_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_scale_by_constant(GenericTensorAccessorR const &input, + float constant, + Allocator &output_allocator); + +void tensor_accessor_scale_by_constant_inplace( + GenericTensorAccessorW const &input, float constant); + +GenericTensorAccessorW tensor_accessor_relu(GenericTensorAccessorR const &input, + Allocator &output_allocator); + +void tensor_accessor_relu_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_broadcast(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + Allocator &output_allocator); + +void tensor_accessor_broadcast_to(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_transpose(GenericTensorAccessorR const &input, + Allocator &output_allocator); + +void tensor_accessor_transpose_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_reduce(GenericTensorAccessorR const &input, + ff_dim_t dim, + Allocator &output_allocator); + +void tensor_accessor_reduce_to(GenericTensorAccessorR const &input, + ff_dim_t dim, + GenericTensorAccessorW const &output); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index 085594d57f..af9de11736 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,23 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "kernels/allocation.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::TopK { -struct TopKPerDeviceState { - req sorted; // Note: Does TopK needs a PerDeviceFFHandle handle? 
-}; - -FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); - -namespace Kernels::TopK { - -TopKPerDeviceState init_kernel(bool sorted); - -void forward_kernel(ffStream_t stream, - TopKPerDeviceState const &m, +void forward_kernel(device_stream_t const &stream, float const *input_ptr, float *output_ptr, int *indices_ptr, @@ -25,8 +13,8 @@ void forward_kernel(ffStream_t stream, int length, int k, bool sorted); -void backward_kernel(ffStream_t stream, - TopKPerDeviceState const &m, + +void backward_kernel(device_stream_t const &stream, float const *out_grad_ptr, int const *indices_ptr, float *in_grad_ptr, @@ -34,7 +22,6 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace Kernels::TopK -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::TopK #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels_cpu.h b/lib/kernels/include/kernels/topk_kernels_cpu.h new file mode 100644 index 0000000000..a3764c40dd --- /dev/null +++ b/lib/kernels/include/kernels/topk_kernels_cpu.h @@ -0,0 +1,25 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_CPU_H + +#include + +namespace FlexFlow::Kernels::TopK { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted); + +void cpu_backward_kernel(float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k); + +} // namespace FlexFlow::Kernels::TopK + +#endif diff --git a/lib/kernels/include/kernels/topk_kernels_gpu.h b/lib/kernels/include/kernels/topk_kernels_gpu.h new file mode 100644 index 0000000000..e669e79048 --- /dev/null +++ b/lib/kernels/include/kernels/topk_kernels_gpu.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_GPU_H + +#include "kernels/device.h" + +namespace FlexFlow::Kernels::TopK { + +void gpu_forward_kernel(ffStream_t stream, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted); + +void gpu_backward_kernel(ffStream_t stream, + float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k); + +} // namespace FlexFlow::Kernels::TopK + +#endif diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 776370dcbd..96b0a9c4aa 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -2,25 +2,21 @@ #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" #include "op-attrs/ops/transpose_attrs.dtg.h" -#include -namespace FlexFlow { +namespace FlexFlow::Kernels::Transpose { -namespace Kernels::Transpose { - -void forward_kernel(cudaStream_t stream, +void forward_kernel(device_stream_t const &stream, TransposeAttrs const &attrs, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void backward_kernel(cudaStream_t stream, +void backward_kernel(device_stream_t const &stream, TransposeAttrs const &attrs, GenericTensorAccessorR const &out_grad, GenericTensorAccessorW const &in_grad); -} // namespace Kernels::Transpose -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Transpose #endif // 
_FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels_cpu.h b/lib/kernels/include/kernels/transpose_kernels_cpu.h new file mode 100644 index 0000000000..dd8963d5e4 --- /dev/null +++ b/lib/kernels/include/kernels/transpose_kernels_cpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "op-attrs/ops/transpose_attrs.dtg.h" + +namespace FlexFlow::Kernels::Transpose { + +void cpu_forward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); + +} // namespace FlexFlow::Kernels::Transpose + +#endif diff --git a/lib/kernels/include/kernels/transpose_kernels_gpu.h b/lib/kernels/include/kernels/transpose_kernels_gpu.h new file mode 100644 index 0000000000..67f6e48665 --- /dev/null +++ b/lib/kernels/include/kernels/transpose_kernels_gpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/transpose_attrs.dtg.h" + +namespace FlexFlow::Kernels::Transpose { + +void gpu_forward_kernel(ffStream_t stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); + +} // namespace FlexFlow::Kernels::Transpose + +#endif diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc deleted file mode 100644 index c0c856ae5b..0000000000 --- a/lib/kernels/src/cpu/ops/combine_kernels.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "kernels/combine_kernels_cpu.h" -#include "kernels/datatype_dispatch.h" - -namespace FlexFlow::Kernels::Combine { - -template -struct CPUForwardKernel { - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - memcpy(output.get
<DT>(), - input.get<DT>(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int()); - } -}; - -template <DataType DT> -struct CPUBackwardKernel { - void operator()(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - size_t num_elements = - output_grad.shape.num_elements().int_from_positive_int(); - for (int i = 0; i < num_elements; ++i) { - input_grad.get<DT>()[i] += output_grad.get<DT>()[i]; - } - } -}; - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); -} - -void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1<CPUBackwardKernel>{}( - input_grad.data_type, output_grad, input_grad); -} - -} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/ops/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc index c7f43b5762..94eac25fdf 100644 --- a/lib/kernels/src/cpu/ops/initializer_kernels.cc +++ b/lib/kernels/src/cpu/ops/initializer_kernels.cc @@ -9,14 +9,14 @@ template <DataType DT> struct ZeroInitKernel { void operator()(GenericTensorAccessorW const &tensor) const { auto arr = get<DT>(tensor);
- for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape.dims); i++) { arr[i] = 0.0f; } } }; void zero_init_kernel_cpu(GenericTensorAccessorW const &tensor) { - DataTypeDispatch1<ZeroInitKernel>{}(tensor.data_type, tensor); + DataTypeDispatch1<ZeroInitKernel>{}(tensor.shape.data_type, tensor); } template <DataType DT> @@ -25,7 +25,7 @@ struct ConstantInitKernel { DataTypeValue value) const { auto arr = get<DT>(tensor);
auto unwrapped_value = value.get<real_type_t<DT>>(); - for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape.dims); i++) { arr[i] = unwrapped_value; } } @@ -33,7 +33,8 @@ struct ConstantInitKernel { void constant_init_kernel_cpu(GenericTensorAccessorW const &tensor, DataTypeValue value) { - DataTypeDispatch1<ConstantInitKernel>{}(tensor.data_type, tensor, value); + DataTypeDispatch1<ConstantInitKernel>{}( + tensor.shape.data_type, tensor, value); } void zero_init_kernel(TaskLocation const &loc, diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc deleted file mode 100644 index bc9c4eab0d..0000000000 --- a/lib/kernels/src/cpu/ops/replicate_kernels.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "kernels/datatype_dispatch.h" -#include "kernels/replicate_kernels_cpu.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow::Kernels::Replicate { - -template <DataType DT> -struct CPUForwardKernel { - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output) { - memcpy(output.get<DT>
<DT>(), - input.get<DT>(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int()); - } -}; - -template <DataType DT> -struct CPUBackwardKernel { - void operator()(GenericTensorAccessorR const &output, - GenericTensorAccessorW &input, - positive_int num_elements, - nonnegative_int num_replicas) { - using T = real_type_t<DT>;
- - for (nonnegative_int i : - nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { - T cur_sum = 0; - for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { - cur_sum += output.at<DT>(LegionOrdered{replica_idx, i}); - } - input.at<DT>
(LegionOrdered{i}) = cur_sum; - } - } -}; - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output) { - DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); -} - -void cpu_backward_kernel(GenericTensorAccessorR const &output, - GenericTensorAccessorW &input, - size_t num_replicas) { - positive_int num_elements = input.shape.num_elements(); - DataTypeDispatch1<CPUBackwardKernel>{}(input.data_type, - output, - input, - num_elements, - nonnegative_int{num_replicas}); -} - -} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 98faadf5ac..cd89945579 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,5 +1,6 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" +#include "op-attrs/tensor_dims.h" #include "utils/containers/reversed.h" namespace FlexFlow { @@ -217,31 +218,44 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } -ffStatus_t - cudnnSetTensorDescriptorFromArrayShape(cudnnTensorDescriptor_t tensor, - ArrayShape const &shape) { - return cudnnSetTensor4dDescriptor( - tensor, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - shape.at_maybe(legion_dim_t{0_n}).value_or(1_p).int_from_positive_int(), - shape.at_maybe(legion_dim_t{1_n}).value_or(1_p).int_from_positive_int(), - shape.at_maybe(legion_dim_t{2_n}).value_or(1_p).int_from_positive_int(), - shape.at_maybe(legion_dim_t{3_n}).value_or(1_p).int_from_positive_int()); -} - -cudnnDataType_t ff_to_cudnn_datatype(DataType type) { - switch (type) { +ffCudnnDataType_t ff_to_cudnn_datatype(DataType flexflow_data_type) { + switch (flexflow_data_type) { + case DataType::BOOL: + return CUDNN_DATA_BOOLEAN; + case DataType::INT32: + return CUDNN_DATA_INT32; + case DataType::INT64: + return CUDNN_DATA_INT64; + case DataType::HALF: + return CUDNN_DATA_HALF; case DataType::FLOAT: return CUDNN_DATA_FLOAT; case DataType::DOUBLE: return CUDNN_DATA_DOUBLE; - case DataType::INT32: - return CUDNN_DATA_INT32; default: - assert(false && "Unsupported cudnn data type"); + PANIC("Unhandled DataType value", flexflow_data_type); } - return CUDNN_DATA_FLOAT; +} + +ffStatus_t + cudnnSetTensorDescriptorFromTensorShape(cudnnTensorDescriptor_t tensor, + TensorShape const &shape) { + return cudnnSetTensor4dDescriptor( + tensor, + CUDNN_TENSOR_NCHW, + ff_to_cudnn_datatype(shape.data_type), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-1}) + .value_or(1_p) + .int_from_positive_int(), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-2}) + .value_or(1_p) + .int_from_positive_int(), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-3}) + .value_or(1_p) + .int_from_positive_int(), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-4}) + .value_or(1_p) + .int_from_positive_int()); } cudaDataType_t ff_to_cuda_datatype(DataType type) { diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index a7e28c6297..be6bcb4ffc 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -15,10 +15,17 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/embedding_kernels.h" +#include "kernels/embedding_kernels_gpu.h" namespace FlexFlow::Kernels::Embedding { +template <typename TD> +__global__ void rand_generate_int(TD *ptr, size_t size, TD p) { + CUDA_KERNEL_LOOP(i, size) { + ptr[i] = i % p; + } +} + void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream;
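// Note: this legacy wrapper still resolves the raw CUDA stream through
// get_legion_stream(), while the refactored entry points in this series take
// a device_stream_t and unwrap it on the GPU path. A sketch of that newer
// pattern, reusing the helpers introduced in profiling.h above:
//   device_stream_t s = get_gpu_device_stream();
//   checkCUDA(ffEventRecord(t_start, s.require_gpu()));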
checkCUDA(get_legion_stream(&stream)); @@ -309,13 +316,6 @@ __global__ void embed_backward_with_aggr(int64_t const *input, } } -template -__global__ void rand_generate_int(TD *ptr, size_t size, TD p) { - CUDA_KERNEL_LOOP(i, size) { - ptr[i] = i % p; - } -} - template struct ForwardKernel { void operator()(cudaStream_t stream, @@ -343,7 +343,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -354,7 +355,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -380,7 +382,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -391,7 +394,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -417,7 +421,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -428,7 +433,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -454,7 +460,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -465,7 +472,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -491,7 +499,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -502,7 +511,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -528,7 +538,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -539,7 +550,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -580,7 +592,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -590,7 +603,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -616,7 +630,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -626,7 +641,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -652,7 +668,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -662,7 +679,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -688,7 +706,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -698,7 +717,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -724,7 +744,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -734,7 +755,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -760,7 +782,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -770,7 +793,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -784,16 +808,16 @@ struct BackwardKernel { } }; -void 
forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &weight, - DataType input_data_type, - DataType output_data_type, - std::optional aggr, - int in_dim, - int out_dim, - int batch_size) { +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + DataType input_data_type, + DataType output_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size) { DataTypeDispatch2{}(input_data_type, output_data_type, stream, @@ -806,16 +830,16 @@ void forward_kernel(ffStream_t stream, batch_size); } -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &weight_grad, - DataType output_data_type, - DataType input_data_type, - std::optional aggr, - int in_dim, - int out_dim, - int batch_size) { +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + DataType output_data_type, + DataType input_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size) { DataTypeDispatch2{}(output_data_type, input_data_type, stream, diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 2fccf4b48f..a98aa0ccda 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/loss_function_kernels.h" +#include "kernels/loss_function_kernels_gpu.h" namespace FlexFlow { @@ -56,7 +56,7 @@ __global__ void identity_loss_backward(float *loss_grad, } } -void sparse_categorical_crossentropy_loss_backward_kernel( +void sparse_categorical_crossentropy_loss_backward_gpu_kernel( cudaStream_t stream, float *logit_grad_ptr, float const *logit_ptr, @@ -83,13 +83,13 @@ void sparse_categorical_crossentropy_loss_backward_kernel( logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); } -void categorical_crossentropy_loss_backward_kernel(cudaStream_t stream, - float *logit_grad_ptr, - float const *logit_ptr, - float const *label_ptr, - size_t logit_volume, - size_t logit_grad_volume, - float scale_factor) { +void categorical_crossentropy_loss_backward_gpu_kernel(cudaStream_t stream, + float *logit_grad_ptr, + float const *logit_ptr, + float const *label_ptr, + size_t logit_volume, + size_t logit_grad_volume, + float scale_factor) { // cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); categorical_crossentropy_loss_backward<<tie() == other.tie(); -} - -bool MHAPerDeviceState::operator!=(MHAPerDeviceState const &other) const { - return this->tie() != other.tie(); -} - -std:: - tuple - MHAPerDeviceState::tie() const { - return std::tie(this->handle, - this->weightSize, - this->reserveSpaceSize, - this->attnDesc, - this->qDesc, - this->kDesc, - this->vDesc, - this->oDesc, - this->devQoSeqArray, - this->devKvSeqArray, - this->loWinIdx, - this->hiWinIdx, - this->reserveSpace); -} - -std::string format_as(MHAPerDeviceState const &x) { - return fmt::format("MHAPerDeviceState"); -} - -std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x) { - return (s << fmt::to_string(x)); -} - -namespace Kernels { -namespace MultiHeadAttention { - -MHAPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - Allocator 
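The renamed loss kernels implement the standard cross-entropy gradient: for softmax probabilities p and an integer label y, d(loss)/d(logit_j) = p_j - [j == y], scaled by a factor such as 1/batch_size. A self-contained sketch of the per-sample update, assuming logit_grad already holds the softmax probabilities when the kernel runs (which appears to match how the in-tree kernel is fed before the trailing scale pass):

#include <cuda_runtime.h>

// Launch with at least n threads; sample i owns the k-wide row i of logit_grad.
__global__ void sparse_ce_backward_sketch(float *logit_grad, // [n, k], holds softmax(z)
                                          int const *labels, // [n], values in [0, k)
                                          int n, int k, float scale) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    logit_grad[i * k + labels[i]] -= 1.0f; // subtract the one-hot target
    for (int j = 0; j < k; j++) {
      logit_grad[i * k + j] *= scale;      // e.g. 1 / batch_size
    }
  }
}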
&allocator, - int num_samples, - int num_heads, - int qSize, - int kSize, - int vSize, - int qProjSize, - int kProjSize, - int vProjSize, - int oProjSize, - int qoSeqLength, - int kvSeqLength, - bool add_bias_kv) { +MHAPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + Allocator &allocator, + int num_samples, + int num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv) { cudaStream_t stream; ffAttnDescriptor_t attnDesc; ffSeqDataDescriptor_t qDesc; @@ -225,31 +188,33 @@ MHAPerDeviceState init_kernel(PerDeviceFFHandle const &handle, hiWinIdx[i] = kvSeqLength; } - MHAPerDeviceState per_device_state = {handle, - weightSize, - reserveSpaceSize, - attnDesc, - qDesc, - kDesc, - vDesc, - oDesc, - devQoSeqArray, - devKvSeqArray, - loWinIdx, - hiWinIdx, - reserveSpace, - allocator}; + MHAPerDeviceState per_device_state = MHAPerDeviceState{ + /*handle=*/handle, + /*weightSize=*/weightSize, + /*reserveSpaceSize=*/reserveSpaceSize, + /*attnDesc=*/attnDesc, + /*qDesc=*/qDesc, + /*kDesc=*/kDesc, + /*vDesc=*/vDesc, + /*oDesc=*/oDesc, + /*devQoSeqArray=*/devQoSeqArray, + /*devKvSeqArray=*/devKvSeqArray, + /*loWinIdx=*/loWinIdx, + /*hiWinIdx=*/hiWinIdx, + /*reserveSpace=*/reserveSpace, + /*allocator=*/allocator, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - MHAPerDeviceState const &device_state, - float const *query_ptr, - float const *key_ptr, - float const *value_ptr, - float const *weight_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(device_state.handle.dnn, stream)); checkCUDNN(cudnnMultiHeadAttnForward(device_state.handle.dnn, @@ -276,17 +241,17 @@ void forward_kernel(cudaStream_t stream, device_state.reserveSpace)); } -void backward_kernel(cudaStream_t stream, - MHAPerDeviceState const &device_state, - float const *query_ptr, - float *query_grad_ptr, - float const *key_ptr, - float *key_grad_ptr, - float const *value_ptr, - float *value_grad_ptr, - float const *weight_ptr, - float *weight_grad_ptr, - float const *output_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr) { checkCUDNN(cudnnSetStream(device_state.handle.dnn, stream)); checkCUDNN(cudnnMultiHeadAttnBackwardData(device_state.handle.dnn, @@ -333,8 +298,8 @@ void backward_kernel(cudaStream_t stream, device_state.reserveSpace)); } -void cleanup_kernel(Allocator &allocator, - MHAPerDeviceState const &device_state) { +void gpu_cleanup_kernel(Allocator &allocator, + MHAPerDeviceState const &device_state) { free(device_state.loWinIdx); free(device_state.hiWinIdx); checkCUDNN(cudnnDestroyAttnDescriptor(device_state.attnDesc)); @@ -344,6 +309,4 @@ void cleanup_kernel(Allocator &allocator, checkCUDNN(cudnnDestroySeqDataDescriptor(device_state.oDesc)); } -} // namespace MultiHeadAttention -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::MultiHeadAttention diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu 
b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index 348eed9f0c..39f5beea21 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -14,24 +14,24 @@ */ #include "internal/device.h" -#include "kernels/batch_matmul_kernels.h" +#include "kernels/batch_matmul_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace BatchMatmul { -void forward_kernel(cudaStream_t stream, - PerDeviceFFHandle const &handle, - float *output_ptr, - float const *a_input_ptr, - float const *b_input_ptr, - int m, - int n, - int k, - int batch, - int a_seq_length_dim, - int b_seq_length_dim, - int seq_length) { +void gpu_forward_kernel(cudaStream_t stream, + PerDeviceFFHandle const &handle, + float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int a_seq_length_dim, + int b_seq_length_dim, + int seq_length) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); int lda = k; @@ -83,18 +83,18 @@ void forward_kernel(cudaStream_t stream, batch)); } -void backward_kernel(cudaStream_t stream, - PerDeviceFFHandle const &handle, - float const *o_ptr, - float const *o_grad_ptr, - float const *a_ptr, - float *a_grad_ptr, - float const *b_ptr, - float *b_grad_ptr, - int m, - int n, - int k, - int batch) { +void gpu_backward_kernel(cudaStream_t stream, + PerDeviceFFHandle const &handle, + float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index ceb3a1b3d9..26234fd6e0 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -23,12 +23,12 @@ namespace FlexFlow { namespace Kernels { namespace BatchNorm { -void forward_kernel(cudaStream_t stream, - BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *scale_ptr, - float const *bias_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + BatchNormPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; @@ -51,16 +51,16 @@ void forward_kernel(cudaStream_t stream, m.saveVar)); } -void backward_kernel(cudaStream_t stream, - BatchNormPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *scale_ptr, - float *scale_grad_ptr, - float *bias_grad_ptr, - size_t numElements) { +void gpu_backward_kernel(cudaStream_t stream, + BatchNormPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; @@ -89,14 +89,14 @@ void backward_kernel(cudaStream_t stream, m.saveVar)); } -BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, - Allocator allocator, - float *runningMean, - int output_n, - int output_c, - int output_h, - int output_w, - bool relu) { +BatchNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const 
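The batch-matmul forward call above reduces to a single strided-batched GEMM. A minimal sketch against the raw cuBLAS API; the in-tree version adds seq-length handling and goes through cublasGemmEx, so treat this as an illustration of the shape and stride bookkeeping rather than the exact call:

#include <cublas_v2.h>

// C[i] = A[i]^T * B[i] for each batch entry, matching the lda = k convention
// in the hunk above. A is k x m per batch (column-major), B is k x n per
// batch, C is m x n per batch.
void batch_matmul_sketch(cublasHandle_t blas, cudaStream_t stream,
                         float const *a, float const *b, float *c,
                         int m, int n, int k, int batch) {
  cublasSetStream(blas, stream);
  float alpha = 1.0f, beta = 0.0f;
  cublasSgemmStridedBatched(blas, CUBLAS_OP_T, CUBLAS_OP_N,
                            m, n, k,
                            &alpha,
                            a, k, (long long)m * k,   // stride to next A
                            b, k, (long long)n * k,   // stride to next B
                            &beta,
                            c, m, (long long)m * n,   // stride to next C
                            batch);
}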
&handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffTensorDescriptor_t biasTensor; @@ -167,19 +167,14 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, return per_device_state; } -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t inputTensor, - ffTensorDescriptor_t biasTensor, - ffTensorDescriptor_t outputTensor, - ffActivationDescriptor_t actiDesc, - bool relu, - float *runningMean) { - allocator.deallocate(runningMean); - checkCUDNN(cudnnDestroyTensorDescriptor(inputTensor)); - checkCUDNN(cudnnDestroyTensorDescriptor(biasTensor)); - checkCUDNN(cudnnDestroyTensorDescriptor(outputTensor)); - if (relu) { - checkCUDNN(cudnnDestroyActivationDescriptor(actiDesc)); +void gpu_cleanup_kernel(Allocator &allocator, + BatchNormPerDeviceState &per_device_state) { + allocator.deallocate(per_device_state.runningMean); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.inputTensor)); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.biasTensor)); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.outputTensor)); + if (per_device_state.relu) { + checkCUDNN(cudnnDestroyActivationDescriptor(per_device_state.actiDesc)); } } diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index 3de6de9d5e..7e38c7af40 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_gpu.h" #include "kernels/datatype_dispatch.h" namespace FlexFlow { @@ -41,7 +41,7 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(input.shape.dims).int_from_positive_int(); cast_forward<<>>( input.get(), output.get(), volume); } @@ -52,24 +52,24 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(output.shape.dims).int_from_positive_int(); cast_backward<<>>( output.get(), input.get(), volume, cast_to(1.0f)); } }; -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input.data_type, output.data_type, stream, input, output); + input.shape.data_type, output.shape.data_type, stream, input, output); } -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - output.data_type, input.data_type, stream, output, input); + output.shape.data_type, input.shape.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu deleted file mode 100644 index f091a69b71..0000000000 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ /dev/null @@ -1,68 +0,0 @@ -/* 
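cast_kernels.cu now reads both element types off the tensor shapes and double-dispatches on them via DataTypeDispatch2; the kernel underneath is just an element-wise static_cast. A trimmed sketch of that kernel, with the dispatcher assumed to map the two runtime DataType values onto the IDT/ODT template parameters:

#include <cuda_runtime.h>
#include <cstddef>

// One thread per element; each output element is the converted input element.
template <typename IDT, typename ODT>
__global__ void cast_forward_sketch(IDT const *in, ODT *out, size_t volume) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < volume) {
    out[i] = static_cast<ODT>(in[i]);
  }
}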
Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "internal/device.h" -#include "kernels/accessor.h" -#include "kernels/combine_kernels.h" -#include "kernels/datatype_dispatch.h" - -namespace FlexFlow { -namespace Kernels { -namespace Combine { - -template -struct ForwardKernel { - void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync(output.get
(), - input.get
(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -template -struct BackwardKernel { - void operator()(ffStream_t stream, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - size_t num_elements = - output_grad.shape.num_elements().int_from_positive_int(); - add_kernel> - <<>>( - input_grad.get
(), output_grad.get
(), num_elements); - } -}; - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); -} - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( - input_grad.data_type, stream, output_grad, input_grad); -} - -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index e7f88bc258..667a7c0b74 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -14,32 +14,26 @@ */ #include "internal/device.h" -#include "kernels/concat_kernels.h" +#include "kernels/concat_kernels_gpu.h" #include namespace FlexFlow::Kernels::Concat { -void calc_blk_size(size_t &num_blocks, - size_t &blk_size, - ArrayShape const &shape, - ff_dim_t axis) { - legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); - assert(legion_axis.value < shape.num_dims()); - if (legion_axis.value == 0_n) { - legion_axis.value = 1_n; - } - blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) - .num_elements() +static void calc_blk_size(size_t &num_blocks, + size_t &blk_size, + TensorShape const &shape, + ff_dim_t axis) { + blk_size = get_num_elements(slice_tensor_dims(shape.dims, axis, std::nullopt)) .int_from_positive_int(); - num_blocks = shape.sub_shape(legion_axis, std::nullopt) - .num_elements() - .int_from_positive_int(); + num_blocks = + get_num_elements(slice_tensor_dims(shape.dims, ff_dim_t{0_n}, axis)) + .int_from_positive_int(); } -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &output, - std::vector const &inputs, - ff_dim_t axis) { +void gpu_forward_kernel(cudaStream_t stream, + GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis) { assert(inputs.size() <= MAX_NUM_INPUTS); size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); @@ -68,10 +62,10 @@ void forward_kernel(cudaStream_t stream, } } -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output_grad, - std::vector const &input_grads, - ff_dim_t axis) { +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis) { assert(input_grads.size() <= MAX_NUM_INPUTS); size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 6e446008ed..92046b30ae 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -113,19 +113,20 @@ cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( return perfResults[0].algo; } -Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int kernel_h, - int kernel_w, - int groups, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - GenericTensorAccessorW const &input, - GenericTensorAccessorW const &output, - float const *filter_ptr, - float *filter_grad_ptr) { +Conv2DPerDeviceState + gpu_init_kernel(PerDeviceFFHandle const &handle, + std::optional const &activation, + int kernel_h, + int kernel_w, + int groups, + int pad_h, + int pad_w, + int stride_h, + int 
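The new calc_blk_size deserves a worked example: with dims listed outermost-first, everything from the concat axis inward forms one contiguous block and everything before the axis counts the blocks. A standalone sketch of the same arithmetic, assuming plain row-major dims:

#include <cstddef>
#include <vector>

// For dims = {2, 3, 4} and axis = 1: blk_size = 3 * 4 = 12, num_blocks = 2,
// so each input is copied as 2 blocks of 12 contiguous elements.
void calc_blk_size_sketch(size_t &num_blocks, size_t &blk_size,
                          std::vector<size_t> const &dims, size_t axis) {
  num_blocks = 1;
  blk_size = 1;
  for (size_t d = 0; d < dims.size(); d++) {
    (d < axis ? num_blocks : blk_size) *= dims[d];
  }
}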
stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t biasTensor; @@ -137,15 +138,23 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - int input_w = input.shape.at(legion_dim_t(0_n)).int_from_positive_int(); - int input_h = input.shape.at(legion_dim_t(1_n)).int_from_positive_int(); - int input_c = input.shape.at(legion_dim_t(2_n)).int_from_positive_int(); - int input_n = input.shape.at(legion_dim_t(3_n)).int_from_positive_int(); + int input_w = + dim_at_idx(input.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); + int input_h = + dim_at_idx(input.shape.dims, legion_dim_t{1_n}).int_from_positive_int(); + int input_c = + dim_at_idx(input.shape.dims, legion_dim_t{2_n}).int_from_positive_int(); + int input_n = + dim_at_idx(input.shape.dims, legion_dim_t{3_n}).int_from_positive_int(); - int output_w = output.shape.at(legion_dim_t(0_n)).int_from_positive_int(); - int output_h = output.shape.at(legion_dim_t(1_n)).int_from_positive_int(); - int output_c = output.shape.at(legion_dim_t(2_n)).int_from_positive_int(); - int output_n = output.shape.at(legion_dim_t(3_n)).int_from_positive_int(); + int output_w = + dim_at_idx(output.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); + int output_h = + dim_at_idx(output.shape.dims, legion_dim_t{1_n}).int_from_positive_int(); + int output_c = + dim_at_idx(output.shape.dims, legion_dim_t{2_n}).int_from_positive_int(); + int output_n = + dim_at_idx(output.shape.dims, legion_dim_t{3_n}).int_from_positive_int(); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); @@ -154,13 +163,7 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); - checkCUDNN(cudnnSetTensor4dDescriptor(inputTensor, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - input_n, - input_c, - input_h, - input_w)); + checkCUDNN(cudnnSetTensorDescriptorFromTensorShape(inputTensor, input.shape)); checkCUDNN(cudnnSetTensor4dDescriptor( biasTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_c, 1, 1)); @@ -253,26 +256,28 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } - Conv2DPerDeviceState per_device_state = {handle, - inputTensor, - biasTensor, - outputTensor, - filterDesc, - actiDesc, - convDesc, - fwdAlgo, - bwdFilterAlgo, - bwdDataAlgo}; + Conv2DPerDeviceState per_device_state = Conv2DPerDeviceState{ + handle, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo, + }; return per_device_state; } -void forward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *filter_ptr, - float const *bias_ptr, - std::optional activation) { +void gpu_forward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional activation) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; @@ -311,16 +316,16 @@ void forward_kernel(ffStream_t stream, } } -void backward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, - float const 
*output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *filter_ptr, - float *filter_grad_ptr, - float *bias_grad_ptr, - std::optional activation) { +void gpu_backward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; @@ -386,6 +391,10 @@ void backward_kernel(ffStream_t stream, } } +void gpu_cleanup_kernel(Conv2DPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace Conv2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index c5fa56bc78..fc37696c24 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -21,11 +21,11 @@ namespace FlexFlow { namespace Kernels { namespace Dropout { -DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, - float rate, - unsigned long long seed, - ArrayShape const &output_shape, - Allocator allocator) { +DropoutPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffDropoutDescriptor_t dropoutDesc; @@ -37,9 +37,10 @@ DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc)); checkCUDNN(cudnnDropoutGetStatesSize(handle.dnn, &(dropoutStateSize))); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, output_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(inputTensor, output_shape)); + checkCUDNN( + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); checkCUDNN( cudnnDropoutGetReserveSpaceSize(outputTensor, &(reserveSpaceSize))); { @@ -50,21 +51,23 @@ DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, } checkCUDNN(cudnnSetDropoutDescriptor( dropoutDesc, handle.dnn, rate, dropoutStates, dropoutStateSize, seed)); - DropoutPerDeviceState per_device_state = {handle, - inputTensor, - outputTensor, - dropoutDesc, - reserveSpace, - dropoutStates, - reserveSpaceSize, - dropoutStateSize}; + DropoutPerDeviceState per_device_state = DropoutPerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*dropoutDesc=*/dropoutDesc, + /*reserveSpace=*/reserveSpace, + /*dropoutStates=*/dropoutStates, + /*reserveSpaceSize=*/reserveSpaceSize, + /*dropoutStateSize=*/dropoutStateSize, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - DropoutPerDeviceState const &m, - float const *input_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + DropoutPerDeviceState const &m, + float const *input_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); checkCUDNN(cudnnDropoutForward(m.handle.dnn, @@ -77,10 +80,10 @@ void forward_kernel(cudaStream_t stream, m.reserveSpaceSize)); } -void backward_kernel(cudaStream_t stream, - DropoutPerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, 
+ DropoutPerDeviceState const &m, + float const *output_grad_ptr, + float *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); checkCUDNN(cudnnDropoutBackward(m.handle.dnn, @@ -93,15 +96,12 @@ void backward_kernel(cudaStream_t stream, m.reserveSpaceSize)); } -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t inputTensor, - ffTensorDescriptor_t outputTensor, - ffDropoutDescriptor_t dropoutDesc, - void *dropoutStates) { - allocator.deallocate(dropoutStates); - checkCUDNN(cudnnDestroyTensorDescriptor(inputTensor)); - checkCUDNN(cudnnDestroyTensorDescriptor(outputTensor)); - checkCUDNN(cudnnDestroyDropoutDescriptor(dropoutDesc)); +void gpu_cleanup_kernel(Allocator &allocator, + DropoutPerDeviceState const &per_device_state) { + allocator.deallocate(per_device_state.dropoutStates); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.inputTensor)); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.outputTensor)); + checkCUDNN(cudnnDestroyDropoutDescriptor(per_device_state.dropoutDesc)); } } // namespace Dropout diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 3a4a77b3dd..7e13486429 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/element_binary_kernels.h" +#include "kernels/element_binary_kernels_gpu.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -79,13 +79,13 @@ __global__ void elewise_binary_backward_kernel(size_t volume, } } -ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, - OperatorType op_type, - bool should_broadcast_lhs, - bool should_broadcast_rhs, - ArrayShape lhs_shape, - ArrayShape rhs_shape, - ArrayShape output_shape) { +ElementBinaryPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape) { ffTensorDescriptor_t inputLHSTensor; ffTensorDescriptor_t inputRHSTensor; ffTensorDescriptor_t outputTensor; @@ -124,28 +124,32 @@ ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, CUDNN_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputLHSTensor, lhs_shape)); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputRHSTensor, rhs_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(inputLHSTensor, lhs_shape)); + checkCUDNN( + cudnnSetTensorDescriptorFromTensorShape(inputRHSTensor, rhs_shape)); + checkCUDNN( + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); - ElementBinaryPerDeviceState per_device_state = {handle, - inputLHSTensor, - inputRHSTensor, - outputTensor, - opDesc, - reduceAddDesc}; + ElementBinaryPerDeviceState per_device_state = ElementBinaryPerDeviceState{ + /*handle=*/handle, + /*inputLHSTensor=*/inputLHSTensor, + /*inputRHSTensor=*/inputRHSTensor, + /*outputTensor=*/outputTensor, + /*opDesc=*/opDesc, + /*reduceAddDesc=*/reduceAddDesc, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *lhs_ptr, - float const *rhs_ptr, - float *out_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - PerDeviceFFHandle 
handle) { +void gpu_forward_kernel(cudaStream_t stream, + ElementBinaryPerDeviceState const &m, + float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + PerDeviceFFHandle handle) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f; @@ -242,17 +246,17 @@ void forward_kernel(cudaStream_t stream, } } -void backward_kernel(cudaStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *out_grad_ptr, - float const *lhs_ptr, - float const *rhs_ptr, - float *lhs_grad_ptr, - float *rhs_grad_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - bool broadcast_inputRHS, - PerDeviceFFHandle handle) { +void gpu_backward_kernel(cudaStream_t stream, + ElementBinaryPerDeviceState const &m, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + PerDeviceFFHandle handle) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); @@ -421,6 +425,10 @@ void backward_kernel(cudaStream_t stream, } } +void gpu_cleanup_kernel(ElementBinaryPerDeviceState const &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace ElementBinary } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 21ac95c204..8fdc3ca8ee 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -15,7 +15,7 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/element_unary_kernels.h" +#include "kernels/element_unary_kernels_gpu.h" #include "op-attrs/get_op_type.h" #include @@ -48,9 +48,10 @@ static bool use_scalar(OperatorType op_type) { } } -static ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, - ArrayShape const &output_shape, - OperatorType op_type) { +static ElementUnaryPerDeviceState + gpu_init_kernel(TensorShape const &input_shape, + TensorShape const &output_shape, + OperatorType op_type) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; @@ -81,18 +82,22 @@ static ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, checkCUDNN( cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape)); + cudnnSetTensorDescriptorFromTensorShape(inputTensor, input_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); } - return {inputTensor, outputTensor, actiDesc}; + return ElementUnaryPerDeviceState{ + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*actiDesc=*/actiDesc, + }; } -ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, - ArrayShape const &output_shape, - ElementUnaryAttrs const &attrs) { - return init_kernel(input_shape, output_shape, get_op_type(attrs)); +ElementUnaryPerDeviceState gpu_init_kernel(TensorShape const &input_shape, + TensorShape const &output_shape, + ElementUnaryAttrs const &attrs) { + return gpu_init_kernel(input_shape, output_shape, get_op_type(attrs)); } template @@ -254,6 +259,10 @@ struct ForwardKernel { GenericTensorAccessorR const &input, 
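Where the cuDNN path does not apply, element-wise binary ops fall back to a grid-stride loop in which broadcasting only changes how the operand index is derived. A simplified sketch for ADD with the LHS broadcast across the leading dimension; the in-tree elewise_binary kernels cover more operators and broadcast patterns:

#include <cuda_runtime.h>
#include <cstddef>

// lhs has `inner` elements when broadcast, otherwise outer * inner;
// rhs and out always have outer * inner elements.
__global__ void elewise_add_broadcast_sketch(float const *lhs,
                                             float const *rhs,
                                             float *out,
                                             size_t outer, size_t inner,
                                             bool broadcast_lhs) {
  size_t n = outer * inner;
  for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < n;
       i += (size_t)blockDim.x * gridDim.x) {
    size_t lhs_i = broadcast_lhs ? (i % inner) : i; // reuse the same LHS row
    out[i] = lhs[lhs_i] + rhs[i];
  }
}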
GenericTensorAccessorW const &output) const { checkCUDNN(cudnnSetStream(handle.dnn, stream)); + + size_t num_elements = + get_num_elements(input.shape.dims).int_from_positive_int(); + if (use_cudnn(op_type)) { float alpha = 1.0f, beta = 0.0f; checkCUDNN(cudnnActivationForward(handle.dnn, @@ -266,7 +275,6 @@ struct ForwardKernel { output.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_forward_kernel> <<>>( num_elements, @@ -275,7 +283,6 @@ struct ForwardKernel { input.get(), output.get()); } else { - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_forward_kernel> <<>>( num_elements, op_type, input.get(), output.get()); @@ -295,6 +302,8 @@ struct BackwardKernel { GenericTensorAccessorR const &input, GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); + size_t num_elements = + get_num_elements(input.shape.dims).int_from_positive_int(); if (use_cudnn(op_type)) { float alpha = 1.0f; @@ -312,7 +321,6 @@ struct BackwardKernel { input_grad.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_backward_kernel> <<>>( num_elements, @@ -323,7 +331,6 @@ struct BackwardKernel { input.get(), input_grad.get()); } else { - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_backward_kernel> <<>>( num_elements, @@ -336,13 +343,13 @@ struct BackwardKernel { } }; -void forward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, +void gpu_forward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.shape.data_type, stream, device_state, get_op_type(attrs), @@ -352,15 +359,15 @@ void forward_kernel(ffStream_t stream, output); } -void backward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}(input.data_type, +void gpu_backward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}(input.shape.data_type, stream, device_state, get_op_type(attrs), @@ -372,6 +379,10 @@ void backward_kernel(ffStream_t stream, input_grad); } +void gpu_cleanup_kernel(ElementUnaryPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace ElementUnary } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 9dee095071..e3495750c2 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ 
b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -15,37 +15,38 @@ #include "internal/device.h" #include "kernels/accessor.h" -#include "kernels/flat_kernels.h" +#include "kernels/flat_kernels_gpu.h" +#include "op-attrs/tensor_shape.h" namespace FlexFlow { namespace Kernels { namespace Flat { -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorR input, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &input, + float *output_ptr) { - checkCUDA(cudaMemcpyAsync(output_ptr, - input.get_float_ptr(), - input.shape.num_elements().int_from_positive_int() * - sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA(cudaMemcpyAsync( + output_ptr, + input.get_float_ptr(), + get_size_in_bytes(input.shape).unwrap_num_bytes().unwrap_nonnegative(), + cudaMemcpyDeviceToDevice, + stream)); } -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR input, - float const *output_grad_ptr, - float *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &input, + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale - <<>>(input_grad_ptr, output_grad_ptr, - input.shape.num_elements().int_from_positive_int(), + get_num_elements(input.shape.dims).int_from_positive_int(), alpha); } diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index e251a57f8a..7b173fdd5e 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -16,7 +16,8 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" -#include "kernels/gather_kernels.h" +#include "kernels/gather_kernels_gpu.h" +#include "op-attrs/ff_dim_t.h" namespace FlexFlow::Kernels::Gather { @@ -117,72 +118,84 @@ struct BackwardKernel { } }; -void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output) { +GatherPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim) { + return GatherPerDeviceState{ + /*handle=*/handle, + /*dim=*/dim, + }; +} + +void gpu_forward_kernel(ffStream_t stream, + GatherPerDeviceState const &m, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape - .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) - .num_elements() - .int_from_positive_int(); - if (m.legion_dim.value == 0_n) { + + std::optional stride = std::nullopt; + if (m.dim.value == 0_n) { stride = 1; + } else { + stride = get_num_elements(slice_tensor_dims(output.shape.dims, + add_to_ff_dim(m.dim, -1), + std::nullopt)) + .int_from_positive_int(); } coord_t output_dim_size = - output.shape.at(m.legion_dim).int_from_positive_int(); - coord_t input_dim_size = input.shape.at(m.legion_dim).int_from_positive_int(); + dim_at_idx(output.shape.dims, m.dim).int_from_positive_int(); + coord_t input_dim_size = + dim_at_idx(input.shape.dims, m.dim).int_from_positive_int(); - assert(index.data_type == DataType::INT32 || - index.data_type == DataType::INT64); + assert(index.shape.data_type == DataType::INT32 || + index.shape.data_type == DataType::INT64); DataTypeDispatch1{}( - index.data_type, + index.shape.data_type, stream, input, index, output, - output.shape.num_elements().int_from_positive_int(), - 
stride, + get_num_elements(output.shape.dims).int_from_positive_int(), + stride.value(), input_dim_size, output_dim_size); } -void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad) { +void gpu_backward_kernel(ffStream_t stream, + GatherPerDeviceState const &m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output_grad.shape - .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) - .num_elements() - .int_from_positive_int(); - if (m.legion_dim.value == 0_n) { + std::optional stride = std::nullopt; + if (m.dim.value == 0_n) { stride = 1; + } else { + stride = get_num_elements(slice_tensor_dims(output_grad.shape.dims, + add_to_ff_dim(m.dim, -1), + std::nullopt)) + .int_from_positive_int(); } coord_t output_dim_size = - output_grad.shape.at(m.legion_dim).int_from_positive_int(); + dim_at_idx(output_grad.shape.dims, m.dim).int_from_positive_int(); coord_t input_dim_size = - input_grad.shape.at(m.legion_dim).int_from_positive_int(); + dim_at_idx(input_grad.shape.dims, m.dim).int_from_positive_int(); - assert(index.data_type == DataType::INT32 || - index.data_type == DataType::INT64); + assert(index.shape.data_type == DataType::INT32 || + index.shape.data_type == DataType::INT64); DataTypeDispatch1{}( - index.data_type, + index.shape.data_type, stream, output_grad, index, input_grad, - output_grad.shape.num_elements().int_from_positive_int(), - stride, + get_num_elements(output_grad.shape.dims).int_from_positive_int(), + stride.value(), input_dim_size, output_dim_size); } diff --git a/lib/kernels/src/cuda/ops/layer_norm_kernels.cu b/lib/kernels/src/cuda/ops/layer_norm_kernels.cu index 40c3e79e41..31f26cca02 100644 --- a/lib/kernels/src/cuda/ops/layer_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/layer_norm_kernels.cu @@ -15,7 +15,7 @@ #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" -#include "kernels/layer_norm_kernels.h" +#include "kernels/layer_norm_kernels_gpu.h" namespace FlexFlow { @@ -289,12 +289,12 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } // TODO: handle any data type for stats -LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - Allocator &allocator, - bool elementwise_affine_, - int64_t effective_batch_size_, - int64_t effective_num_elements_, - float eps_) { +LayerNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + Allocator &allocator, + bool elementwise_affine_, + int64_t effective_batch_size_, + int64_t effective_num_elements_, + float eps_) { float *mean = (float *)allocator.allocate(sizeof(float) * effective_batch_size_); float *rstd = @@ -307,18 +307,20 @@ LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, (float *)allocator.allocate(sizeof(float) * effective_batch_size_); float *bias = (float *)allocator.allocate(sizeof(float) * effective_batch_size_); - LayerNormPerDeviceState per_device_state = {handle, - elementwise_affine_, - effective_batch_size_, - effective_num_elements_, - eps_, - mean, - rstd, - ds, - db, - scale, - bias, - DataType::FLOAT}; + LayerNormPerDeviceState per_device_state = LayerNormPerDeviceState{ + /*handle=*/handle, + /*elementwise_affine=*/elementwise_affine_, + /*effective_num_elements=*/effective_num_elements_, + /*effective_batch_size=*/effective_batch_size_, + 
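The gather rewrite replaces compute-then-overwrite stride logic with a std::optional that every branch assigns before it is read. A condensed sketch of that control-flow pattern; the product below mirrors the slice_tensor_dims(dims, dim - 1, std::nullopt) call in the hunk above and is not a claim about the exact in-tree index semantics:

#include <cstdint>
#include <optional>
#include <vector>

int64_t gather_stride_sketch(std::vector<int64_t> const &dims, size_t dim) {
  std::optional<int64_t> stride = std::nullopt;
  if (dim == 0) {
    stride = 1;
  } else {
    int64_t product = 1;
    for (size_t d = dim - 1; d < dims.size(); d++) { // dims[dim-1 ..]
      product *= dims[d];
    }
    stride = product;
  }
  return stride.value(); // safe: both branches assigned above
}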
/*eps=*/eps_, + /*mean=*/mean, + /*rstd=*/rstd, + /*ds=*/ds, + /*db=*/db, + /*scale=*/scale, + /*bias=*/bias, + /*data_type=*/DataType::FLOAT, + }; return per_device_state; } @@ -407,24 +409,24 @@ struct BackwardKernel { } }; -void forward_kernel(cudaStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorW const &gamma, - GenericTensorAccessorW const &beta) { +void gpu_forward_kernel(cudaStream_t stream, + LayerNormPerDeviceState const &m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &gamma, + GenericTensorAccessorW const &beta) { DataTypeDispatch1{}( m.data_type, stream, m, input, output, gamma, beta); } -void backward_kernel(cudaStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorW const &gamma_grad, - GenericTensorAccessorW const &beta_grad) { +void gpu_backward_kernel(cudaStream_t stream, + LayerNormPerDeviceState const &m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { DataTypeDispatch1{}(m.data_type, stream, m, @@ -436,6 +438,10 @@ void backward_kernel(cudaStream_t stream, beta_grad); } +void gpu_cleanup_kernel(LayerNormPerDeviceState const &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace LayerNorm } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 02bda55828..fa474d854a 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -15,7 +15,7 @@ #include "internal/device.h" #include "kernels/allocation.h" -#include "kernels/linear_kernels.h" +#include "kernels/linear_kernels_gpu.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -23,7 +23,7 @@ namespace FlexFlow { namespace Kernels { namespace Linear { -bool use_activation(std::optional activation) { +static bool use_activation(std::optional activation) { if (activation.has_value()) { switch (activation.value()) { case Activation::RELU: @@ -40,17 +40,16 @@ bool use_activation(std::optional activation) { return false; } -// what's the float * one_ptr -LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, - float *one_ptr, - std::optional activation, - std::optional regularizer, - bool use_bias, - DataType input_type, - DataType weight_type, - DataType output_type, - int batch_size, - int channel) { +LinearPerDeviceState + gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int channel) { ffTensorDescriptor_t outputTensor; ffActivationDescriptor_t actiDesc; checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -82,6 +81,8 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, // Unsupported activation mode assert(false); } + } else { + mode = CUDNN_ACTIVATION_IDENTITY; } checkCUDNN( cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); @@ -91,30 +92,41 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, // 
todo: how to use allocator to allocate memory for float * one_ptr, how many // bytes to allocate? + float *one_ptr; checkCUDA(cudaMalloc(&one_ptr, sizeof(float) * batch_size)); - LinearPerDeviceState per_device_state = {handle, - outputTensor, - actiDesc, - one_ptr, - mode, - activation, - regularizer, - use_bias, - input_type, - weight_type, - output_type}; + float one_ptr_cpu[batch_size]; + for (int i = 0; i < batch_size; i++) { + one_ptr_cpu[i] = 1.0; + } + checkCUDA(cudaMemcpy(one_ptr, + one_ptr_cpu, + sizeof(float) * batch_size, + cudaMemcpyHostToDevice)); + LinearPerDeviceState per_device_state = LinearPerDeviceState{ + /*handle=*/handle, + /*outputTensor=*/outputTensor, + /*actiDesc=*/actiDesc, + /*one_ptr=*/one_ptr, + /*mode=*/mode, + /*activation=*/activation, + /*regularizer=*/regularizer, + /*use_bias=*/use_bias, + /*input_type=*/input_type, + /*weight_type=*/weight_type, + /*output_type=*/output_type, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - LinearPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int in_dim, - int out_dim, - int batch_size) { +void gpu_forward_kernel(cudaStream_t stream, + LinearPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { checkCUBLAS(cublasSetStream(m.handle.blas, stream)); checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); @@ -147,10 +159,9 @@ void forward_kernel(cudaStream_t stream, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // use_bias = True - if (bias_ptr != NULL) { + if (bias_ptr != nullptr) { checkCUBLAS(cublasGemmEx(m.handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_N, out_dim, batch_size, @@ -158,7 +169,7 @@ void forward_kernel(cudaStream_t stream, &alpha, static_cast(bias_ptr), weight_type, - 1, + out_dim, static_cast(m.one_ptr), CUDA_R_32F, 1, @@ -169,38 +180,38 @@ void forward_kernel(cudaStream_t stream, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - if (use_activation(m.activation)) { - checkCUDNN(cudnnActivationForward(m.handle.dnn, - m.actiDesc, - &alpha, - m.outputTensor, - static_cast(output_ptr), - &beta, - m.outputTensor, - static_cast(output_ptr))); - } else if (m.activation == Activation::GELU) { - size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); - constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) - constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - gelu_forward_kernel<<>>( - elements, B, C, (float *)output_ptr); - } else { - // Do nothing - } + // if (use_activation(m.activation)) { + // checkCUDNN(cudnnActivationForward(m.handle.dnn, + // m.actiDesc, + // &alpha, + // m.outputTensor, + // static_cast(output_ptr), + // &beta, + // m.outputTensor, + // static_cast(output_ptr))); + // } else if (m.activation == Activation::GELU) { + // size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); + // constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) + // constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) + // gelu_forward_kernel<<>>( + // elements, B, C, (float *)output_ptr); + // } else { + // // Do nothing + // } } -void backward_kernel(cudaStream_t stream, - LinearPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *bias_grad_ptr, - int in_dim, - 
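The one_ptr initialization above answers the old TODO by filling a host array of ones and copying it to the device, but it does so with a variable-length stack array, which is a compiler extension rather than standard C++. A heap-backed sketch of the same initialization; cudaMemset is not an option here, since 1.0f is not a repeated single-byte pattern:

#include <cuda_runtime.h>
#include <vector>

// Allocate and fill a device buffer of `batch_size` floats with 1.0f.
float *make_one_ptr_sketch(int batch_size) {
  float *one_ptr = nullptr;
  cudaMalloc(&one_ptr, sizeof(float) * batch_size);
  std::vector<float> ones(batch_size, 1.0f); // heap-backed, standard C++
  cudaMemcpy(one_ptr, ones.data(), sizeof(float) * batch_size,
             cudaMemcpyHostToDevice);
  return one_ptr;
}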
int out_dim, - int batch_size) { +void gpu_backward_kernel(cudaStream_t stream, + LinearPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, + int in_dim, + int out_dim, + int batch_size) { checkCUBLAS(cublasSetStream(m.handle.blas, stream)); checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; @@ -229,9 +240,10 @@ void backward_kernel(cudaStream_t stream, stream); } else { // TODO: only support relu and sigmoid for now - assert(false && "Unsupported activation for Linear"); + PANIC("Unsupported activation for Linear", m.activation.value()); } } + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUBLAS(cublasGemmEx(m.handle.blas, @@ -328,6 +340,10 @@ void backward_kernel(cudaStream_t stream, } } +void gpu_cleanup_kernel(LinearPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace Linear } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu deleted file mode 100644 index 94690a74fb..0000000000 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "internal/device.h" -#include "kernels/datatype_dispatch.h" -#include "kernels/partition_kernels.h" - -namespace FlexFlow { -namespace Kernels { -namespace Repartition { - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync(output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - add_kernel> - <<>>(input_grad.get(), - output_grad.get(), - input_grad.shape.num_elements().int_from_positive_int()); - } -}; - -RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - DataType data_type) { - RepartitionPerDeviceState per_device_state = {handle, data_type}; - return per_device_state; -} - -void forward_kernel(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m.data_type, stream, m, input, output); -} - -void backward_kernel(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( - m.data_type, stream, m, output_grad, input_grad); -} - -} // namespace Repartition -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index e8ea3f64c2..ec185a360e 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -14,30 +14,30 @@ */ #include "internal/device.h" -#include "kernels/pool_2d_kernels.h" +#include "kernels/pool_2d_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Pool2D { -Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int pad_h, - int pad_w, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - PoolOp pool_type) { +Pool2DPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + int input_w, + int input_h, + int input_c, + int input_n, + int output_w, + int output_h, + int output_c, + int output_n, + int pad_h, + int pad_w, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + PoolOp pool_type) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffActivationDescriptor_t actiDesc; @@ -87,15 +87,21 @@ Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, if (activation == Activation::RELU) { relu = true; } - Pool2DPerDeviceState state = { - handle, inputTensor, outputTensor, actiDesc, poolDesc, relu}; + Pool2DPerDeviceState state = Pool2DPerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*actiDesc=*/actiDesc, + /*poolDesc=*/poolDesc, + /*relu=*/relu, + }; return state; } -void forward_kernel(cudaStream_t stream, - Pool2DPerDeviceState const &m, - void const *input_ptr, - void *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + Pool2DPerDeviceState const &m, + void const *input_ptr, + void 
*output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); @@ -110,12 +116,12 @@ void forward_kernel(cudaStream_t stream, output_ptr)); } -void backward_kernel(cudaStream_t stream, - Pool2DPerDeviceState const &m, - void const *output_ptr, - void const *output_grad_ptr, - void const *input_ptr, - void *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + Pool2DPerDeviceState const &m, + void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); @@ -134,6 +140,10 @@ void backward_kernel(cudaStream_t stream, input_grad_ptr)); } +void gpu_cleanup_kernel(Pool2DPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace Pool2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 563bbae21d..20c974e4d8 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -14,17 +14,17 @@ */ #include "internal/device.h" -#include "kernels/reduce_kernels.h" +#include "kernels/reduce_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Reduce { -ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle, - OperatorType const &op_type, - size_t const &reduction_size, - ArrayShape const &input_shape, - ArrayShape const &output_shape) { +ReducePerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + OperatorType const &op_type, + size_t const &reduction_size, + TensorShape const &input_shape, + TensorShape const &output_shape) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; @@ -35,19 +35,25 @@ ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle, checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape)); + checkCUDNN(cudnnSetTensorDescriptorFromTensorShape(inputTensor, input_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); - ReducePerDeviceState per_device = { - handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size}; + ReducePerDeviceState per_device = ReducePerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*reduceDesc=*/reduceDesc, + /*op_type=*/op_type, + /*reduction_size=*/reduction_size, + }; return per_device; } -void forward_kernel(cudaStream_t stream, - ReducePerDeviceState const &m, - float const *input_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + ReducePerDeviceState const &m, + float const *input_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; checkCUDNN(cudnnReduceTensor(m.handle.dnn, @@ -64,10 +70,10 @@ void forward_kernel(cudaStream_t stream, output_ptr)); }; -void backward_kernel(cudaStream_t stream, - ReducePerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + ReducePerDeviceState const &m, + float const *output_grad_ptr, + float *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0, beta = 1.0f; switch (m.op_type) { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu deleted file mode 100644 index 93400d333f..0000000000 --- 
a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "internal/device.h" -#include "kernels/datatype_dispatch.h" -#include "kernels/reduction_kernels.h" - -namespace FlexFlow { -namespace Kernels { -namespace Reduction { - -template -__global__ void reduction_forward_kernel(T const *input_ptr, - T *output_ptr, - size_t num_elements, - size_t num_replicas) { - CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input_ptr[i]; - for (size_t j = 1; j < num_replicas; j++) { - output_ptr[i] += input_ptr[i + j * num_elements]; - } - } -} - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - size_t num_replicas) { - - size_t total_elements = - input.shape.num_elements().int_from_positive_int() * num_replicas; - reduction_forward_kernel> - <<>>( - input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int(), - num_replicas); - } -}; - -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { - checkCUDA( - cudaMemcpyAsync(input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - size_t num_replicas) { - DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); -} - -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { - DataTypeDispatch1{}(output.data_type, stream, output, input); -} - -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu deleted file mode 100644 index 9f532c96b1..0000000000 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "internal/device.h" -#include "kernels/datatype_dispatch.h" -#include "kernels/replicate_kernels.h" - -namespace FlexFlow { -namespace Kernels { -namespace Replicate { - -template -__global__ void replicate_backward_kernel(T const *output_ptr, - T *input_ptr, - size_t num_elements, - size_t num_replicas) { - CUDA_KERNEL_LOOP(i, num_elements) { - for (size_t j = 0; j < num_replicas; j++) { - input_ptr[i] += output_ptr[i + j * num_elements]; - } - } -} - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync((void *)output.get(), - (void *)input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input, - size_t num_replicas) { - size_t total_elements = - input.shape.num_elements().int_from_positive_int() * num_replicas; - replicate_backward_kernel> - <<>>( - output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int(), - num_replicas); - } -}; - -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); -} - -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input, - size_t num_replicas) { - DataTypeDispatch1{}( - input.data_type, stream, output, input, num_replicas); -} - -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 3f0d6bb15a..1414706ef0 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -15,61 +15,52 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/reshape_kernels.h" +#include "kernels/reshape_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Reshape { -ReshapePerDeviceState init_kernel(DataType data_type) { - return ReshapePerDeviceState{data_type}; -} - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync(output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); +template +__global__ void apply_add_with_scale2(DT *data_ptr, + DTGrad const *grad_ptr, + size_t size, + DT scale) { + CUDA_KERNEL_LOOP(i, size) { + data_ptr[i] += grad_ptr[i] * scale; } -}; +} -template +template struct BackwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { float alpha = 1.0f; - apply_add_with_scale> - <<, real_type_t> + <<>>(input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int(), - static_cast>(alpha)); + stream>>>(input.get(), + output.get(), + get_num_elements(input.shape.dims).int_from_positive_int(), + static_cast>(alpha)); } }; -void forward_kernel(cudaStream_t stream, - ReshapePerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - 
DataTypeDispatch1{}(m.data_type, stream, input, output); +void gpu_forward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + copy_accessor_data_to_l_from_r(output, input); } -void backward_kernel(cudaStream_t stream, - ReshapePerDeviceState const &m, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { - DataTypeDispatch1{}(m.data_type, stream, output, input); +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + input.shape.data_type, output.shape.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index c63be7f9b4..1fabf4a67e 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_gpu.h" #include "kernels/reverse_kernels_params.h" namespace FlexFlow::Kernels::Reverse { @@ -51,13 +51,13 @@ static void forward_kernel_internal(cudaStream_t stream, in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - ReverseAttrs const &attrs) { +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { auto reverse_kernels_params = - compute_reverse_kernels_params(output_accessor.shape, attrs); + compute_reverse_kernels_params(output_accessor.shape.dims, attrs); forward_kernel_internal( stream, @@ -84,12 +84,12 @@ void backward_kernel_internal(cudaStream_t stream, out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output_grad_accessor, - GenericTensorAccessorW &input_grad_accessor, - ReverseAttrs const &attrs) { +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { auto reverse_kernels_params = - compute_reverse_kernels_params(input_grad_accessor.shape, attrs); + compute_reverse_kernels_params(input_grad_accessor.shape.dims, attrs); backward_kernel_internal( stream, diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index da0ffd846e..85575d7bf6 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -14,19 +14,19 @@ */ #include "internal/device.h" -#include "kernels/softmax_kernels.h" +#include "kernels/softmax_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Softmax { -SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - int dim, - int input_n, - int input_c, - int input_h, - int input_w) { +SoftmaxPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w) { ffTensorDescriptor_t inputTensor; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); @@ -38,14 +38,18 @@ SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, input_h, input_w)); - SoftmaxPerDeviceState per_device_state = {handle, inputTensor, dim}; + SoftmaxPerDeviceState per_device_state = 
SoftmaxPerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*dim=*/dim, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - SoftmaxPerDeviceState const &m, - float const *input_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + SoftmaxPerDeviceState const &m, + float const *input_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; @@ -60,10 +64,10 @@ void forward_kernel(cudaStream_t stream, output_ptr)); } -void backward_kernel(cudaStream_t stream, - float const *output_grad_ptr, - float *input_grad_ptr, - size_t num_elements) { +void gpu_backward_kernel(cudaStream_t stream, + float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, output_grad_ptr, @@ -72,6 +76,10 @@ void backward_kernel(cudaStream_t stream, stream)); } +void gpu_cleanup_kernel(SoftmaxPerDeviceState &) { + NOT_IMPLEMENTED(); +} + } // namespace Softmax } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/split_kernels.cu b/lib/kernels/src/cuda/ops/split_kernels.cu index 5c8b305851..ca953bd7b1 100644 --- a/lib/kernels/src/cuda/ops/split_kernels.cu +++ b/lib/kernels/src/cuda/ops/split_kernels.cu @@ -14,20 +14,20 @@ */ #include "internal/device.h" -#include "kernels/split_kernels.h" +#include "kernels/split_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Split { -void forward_kernel(cudaStream_t stream, - float **out_ptrs, - float const *in_ptr, - coord_t const *out_blk_sizes, - coord_t in_blk_size, - coord_t num_blks, - int numOutputs) { +void gpu_forward_kernel(cudaStream_t stream, + float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { for (int i = 0; i < numOutputs; i++) { copy_with_stride<< struct LinearData { typedef Entry Entry; @@ -369,15 +364,14 @@ __global__ void topk_forward_kernel(T const *__restrict__ input, } } -void forward_kernel(cudaStream_t stream, - TopKPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - int *indices_ptr, - size_t batch_size, - int length, - int k, - bool sorted) { +void gpu_forward_kernel(cudaStream_t stream, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { // Adopted from TensorFlow's TopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h int num_shards = 0; @@ -421,14 +415,13 @@ __global__ void topk_backward_kernel(T const *__restrict__ value_grad_ptr, } } -void backward_kernel(cudaStream_t stream, - TopKPerDeviceState const &m, - float const *value_grad_ptr, - int const *indices_ptr, - float *in_grad_ptr, - size_t batch_size, - int length, - int k) { +void gpu_backward_kernel(cudaStream_t stream, + float const *value_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k) { topk_backward_kernel<< return legion_ordered_perm; } -void forward_kernel(cudaStream_t stream, - TransposeAttrs const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void gpu_forward_kernel(cudaStream_t stream, + TransposeAttrs const &m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { TransposeStrides info; - info.num_dim = input.shape.num_dims().unwrap_nonnegative(); + info.num_dim = get_num_dims(input.shape.dims).unwrap_nonnegative(); 
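// Note (assumed semantics, inferred from the surrounding code): the
// TransposeStrides fields below are filled in Legion (innermost-first)
// dimension order, with each stride built up as a running product of the
// dimension sizes, and m.perm is first converted to a LegionOrdered
// permutation (legion_ordered_perm) before it is used to index dimensions.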
assert(info.num_dim == m.perm.size()); LegionOrdered legion_ordered_perm = @@ -76,10 +76,12 @@ void forward_kernel(cudaStream_t stream, info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = input.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); - int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); + int in_dim_size = + dim_at_idx(input.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); + int out_dim_size = + dim_at_idx(output.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -88,23 +90,23 @@ void forward_kernel(cudaStream_t stream, .value.unwrap_nonnegative(); } transpose_simple_kernel<<< - GET_BLOCKS(output.shape.num_elements().int_from_positive_int()), + GET_BLOCKS(get_num_elements(output.shape.dims).int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(output.shape.num_elements().int_from_positive_int(), + stream>>>(get_num_elements(output.shape.dims).int_from_positive_int(), input.get_float_ptr(), output.get_float_ptr(), info, - 0.0f /*beta*/); + /*beta=*/0.0f); } -void backward_kernel(cudaStream_t stream, - TransposeAttrs const &m, - GenericTensorAccessorR const &out_grad, - GenericTensorAccessorW const &in_grad) { +void gpu_backward_kernel(cudaStream_t stream, + TransposeAttrs const &m, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; - info.num_dim = in_grad.shape.num_dims().unwrap_nonnegative(); + info.num_dim = get_num_dims(in_grad.shape.dims).unwrap_nonnegative(); assert(info.num_dim == m.perm.size()); LegionOrdered legion_ordered_perm = @@ -115,10 +117,12 @@ void backward_kernel(cudaStream_t stream, info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = out_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); - int out_dim_size = in_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); + int in_dim_size = + dim_at_idx(out_grad.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); + int out_dim_size = + dim_at_idx(in_grad.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -126,14 +130,14 @@ void backward_kernel(cudaStream_t stream, .value.unwrap_nonnegative()] = i; } transpose_simple_kernel<<< - GET_BLOCKS(in_grad.shape.num_elements().int_from_positive_int()), + GET_BLOCKS(get_num_elements(in_grad.shape.dims).int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(in_grad.shape.num_elements().int_from_positive_int(), + stream>>>(get_num_elements(in_grad.shape.dims).int_from_positive_int(), out_grad.get_float_ptr(), in_grad.get_float_ptr(), info, - 1.0f /*beta*/); + /*beta=*/1.0f); } } // namespace Transpose diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 2fce3c5db9..f457ec762e 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -15,7 +15,7 @@ #include "internal/device.h" #include "kernels/nccl.h" -#include "kernels/optimizer_kernels.h" +#include "kernels/optimizer_kernels_gpu.h" #include "utils/exception.h" namespace FlexFlow { @@ -43,7 +43,7 @@ __global__ void sgd_update(size_t count, } } -__host__ void 
sgd_ps_update_task_gpu(ffStream_t stream, +__host__ void gpu_sgd_ps_update_task(ffStream_t stream, float lr, float momentum, bool nesterov, @@ -72,8 +72,7 @@ __host__ void sgd_ps_update_task_gpu(ffStream_t stream, weight_ptr); } -#ifdef FF_USE_NCCL -__host__ void sgd_nccl_update_task_gpu(ffStream_t stream, +__host__ void gpu_sgd_nccl_update_task(ffStream_t stream, float lr, float momentum, bool nesterov, @@ -92,7 +91,6 @@ __host__ void sgd_nccl_update_task_gpu(ffStream_t stream, sgd_update<<>>( size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr); } -#endif // ================================================================== // Adam Optimizer @@ -134,7 +132,7 @@ __global__ void adam_update(int count, } } -__host__ void adam_ps_update_task_gpu(ffStream_t stream, +__host__ void gpu_adam_ps_update_task(ffStream_t stream, float alpha_t, float beta1, float beta2, @@ -166,8 +164,7 @@ __host__ void adam_ps_update_task_gpu(ffStream_t stream, w_ptr); } -#ifdef FF_USE_NCCL -__host__ void adam_nccl_update_task_gpu(ffStream_t stream, +__host__ void gpu_adam_nccl_update_task(ffStream_t stream, float alpha_t, float beta1, float beta2, @@ -200,6 +197,5 @@ __host__ void adam_nccl_update_task_gpu(ffStream_t stream, v_ptr, w_ptr); } -#endif } // namespace FlexFlow diff --git a/lib/kernels/src/ff_handle.cc b/lib/kernels/src/ff_handle.cc index 63ca6975fd..0ae8fdf81d 100644 --- a/lib/kernels/src/ff_handle.cc +++ b/lib/kernels/src/ff_handle.cc @@ -1,4 +1,5 @@ #include "kernels/ff_handle.h" +#include namespace FlexFlow { diff --git a/lib/kernels/src/internal/device.h b/lib/kernels/src/internal/device.h index 226c7ad174..2e0495ed33 100644 --- a/lib/kernels/src/internal/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,10 +1,11 @@ #ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H -#include "kernels/array_shape.h" #include "kernels/device.h" +#include "op-attrs/activation.dtg.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" +#include "op-attrs/tensor_shape.dtg.h" #include namespace FlexFlow { @@ -131,8 +132,9 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); -ffStatus_t cudnnSetTensorDescriptorFromArrayShape(ffTensorDescriptor_t tensor, - ArrayShape const &shape); +ffStatus_t + cudnnSetTensorDescriptorFromTensorShape(ffTensorDescriptor_t tensor, + TensorShape const &tensor_shape); ffDataType_t ff_to_cuda_datatype(DataType type); diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index 5a1881eb66..868940bf6c 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -1,6 +1,9 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" #include "kernels/datatype_dispatch.h" +#include "op-attrs/ff_ordered/get_idxs.h" +#include "op-attrs/tensor_dims_coord.h" +#include "op-attrs/tensor_shape.h" #include "utils/containers/reversed.h" #include "utils/containers/vector_of.h" #include "utils/nonnegative_int/nonnegative_range.h" @@ -8,33 +11,42 @@ namespace FlexFlow { -nonnegative_int - calculate_accessor_offset(LegionOrdered const &indices, - ArrayShape const &shape) { - ASSERT(indices.size() == shape.num_dims(), +nonnegative_int calculate_accessor_offset(TensorDimsCoord const &coord, + TensorDims const &tensor_dims) { + ASSERT(tensor_dims_coord_get_num_dims(coord) == get_num_dims(tensor_dims), "Number of indices does not match the number of dimensions"); 
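// Note: this is the standard row-major linearization. Iterating from the
// innermost dimension outward, each dimension's multiplier is the product of
// all faster-varying dimension sizes; e.g. for dims [4, 3], a coord (i, j)
// maps to offset = j * 1 + i * 3.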
nonnegative_int offset = 0_n; positive_int multiplier = 1_p; - for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { - ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), + for (ff_dim_t dim : reversed(get_idxs(tensor_dims.ff_ordered))) { + ASSERT(coord.ff_ordered.at(dim) < dim_at_idx(tensor_dims, dim), "Out of bounds access", dim); - offset += indices.at(dim) * multiplier; - multiplier *= shape.at(legion_dim_t{dim}); + offset += coord.ff_ordered.at(dim) * multiplier; + multiplier *= tensor_dims.ff_ordered.at(dim); } return offset; } +TensorShape + get_tensor_shape_for_accessor_r(GenericTensorAccessorR const &accessor) { + return accessor.shape; +} + +TensorShape + get_tensor_shape_for_accessor_w(GenericTensorAccessorW const &accessor) { + return accessor.shape; +} + void copy_accessor_data_to_l_from_r( - GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &dst_accessor, GenericTensorAccessorR const &src_accessor) { - size_t num_bytes = - dst_accessor.shape.num_elements().int_from_positive_int() * - size_of_datatype(dst_accessor.data_type).int_from_positive_int(); + size_t num_bytes = get_size_in_bytes(dst_accessor.shape) + .unwrap_num_bytes() + .unwrap_nonnegative(); DeviceType dst_device_type = dst_accessor.device_type; DeviceType src_device_type = src_accessor.device_type; @@ -65,18 +77,14 @@ GenericTensorAccessorW::operator GenericTensorAccessorR() const { } GenericTensorAccessorW::GenericTensorAccessorW( - DataType data_type, - ArrayShape const &shape, + TensorShape const &shape, void *ptr, DeviceType device_type = DeviceType::GPU) - : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + : shape(shape), ptr(ptr), device_type(device_type) {} -std::tuple<DataType const &, ArrayShape const &, void *const &, DeviceType const &> +std::tuple<TensorShape const &, void *const &, DeviceType const &> GenericTensorAccessorW::tie() const { - return std::tie(this->data_type, this->shape, this->ptr, this->device_type); + return std::tie(this->shape, this->ptr, this->device_type); } bool GenericTensorAccessorW::operator==( @@ -110,10 +118,10 @@ half *GenericTensorAccessorW::get_half_ptr() const { } std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("<GenericTensorAccessorW data_type={} shape={} ptr={}>", - a.data_type, + return fmt::format("<GenericTensorAccessorW shape={} ptr={} device_type={}>", a.shape, - a.ptr); + a.ptr, + a.device_type); } std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { @@ -121,18 +129,14 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { } GenericTensorAccessorR::GenericTensorAccessorR( - DataType data_type, - ArrayShape const &shape, + TensorShape const &shape, void const *ptr, DeviceType device_type = DeviceType::GPU) - : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + : shape(shape), ptr(ptr), device_type(device_type) {} -std::tuple<DataType const &, ArrayShape const &, void const *const &, DeviceType const &> +std::tuple<TensorShape const &, void const *const &, DeviceType const &> GenericTensorAccessorR::tie() const { - return std::tie(this->data_type, this->shape, this->ptr, this->device_type); + return std::tie(this->shape, this->ptr, this->device_type); } bool GenericTensorAccessorR::operator==( @@ -166,10 +170,10 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("<GenericTensorAccessorR data_type={} shape={} ptr={}>", - a.data_type, + return fmt::format("<GenericTensorAccessorR shape={} ptr={} device_type={}>", a.shape, - a.ptr); + a.ptr, + a.device_type); } std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { @@ -269,45 +273,20 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { return GenericTensorAccessorR{ - writable.data_type, writable.shape, writable.ptr, writable.device_type, }; } -bool 
is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, - GenericTensorAccessorR const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); +bool accessors_have_same_shape(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { + return acc1.shape == acc2.shape; } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); +bool accessors_have_same_shape(GenericTensorAccessorW const &acc1, + GenericTensorAccessorW const &acc2) { + return acc1.shape == acc2.shape; } template int32_t diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc index a6881d240a..93b86f1b6d 100644 --- a/lib/kernels/src/kernels/allocation.cc +++ b/lib/kernels/src/kernels/allocation.cc @@ -17,11 +17,10 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).int_from_positive_int()); + void *ptr = this->allocate( + get_size_in_bytes(tensor_shape).unwrap_num_bytes().unwrap_nonnegative()); return GenericTensorAccessorW{ - tensor_shape.data_type, - array_shape_from_tensor_shape(tensor_shape), + tensor_shape, ptr, this->get_allocation_device_type(), }; diff --git a/lib/kernels/src/kernels/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc deleted file mode 100644 index a1fb9bf09b..0000000000 --- a/lib/kernels/src/kernels/array_shape.cc +++ /dev/null @@ -1,150 +0,0 @@ -#include "kernels/array_shape.h" -#include "kernels/legion_ordered/slice.h" -#include "op-attrs/ff_ordered/ff_ordered_of.h" -#include "op-attrs/ff_ordered/get_idxs.h" -#include "op-attrs/ff_ordered/slice.h" -#include "utils/containers/cartesian_product.h" -#include "utils/containers/product.h" -#include "utils/containers/reversed.h" -#include "utils/containers/transform.h" -#include "utils/containers/unordered_set_of.h" -#include "utils/containers/vector_of.h" -#include "utils/hash/tuple.h" -#include "utils/hash/vector.h" -#include "utils/nonnegative_int/nonnegative_range.h" -#include "utils/nonnegative_int/num_elements.h" - -namespace FlexFlow { - -ArrayShape::ArrayShape(LegionOrdered const &input_dims) - : dims(input_dims) {} - -nonnegative_int ArrayShape::num_dims() const { - return ::FlexFlow::num_elements(this->dims); -} - -positive_int ArrayShape::num_elements() const { - return product(this->dims); -} - -positive_int ArrayShape::operator[](legion_dim_t idx) const { - return dims.at(idx); -} - -positive_int ArrayShape::at(legion_dim_t idx) const { - return dims.at(idx); -} - -positive_int ArrayShape::at(ff_dim_t idx) const { - return 
dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); -} - -bool ArrayShape::operator==(ArrayShape const &other) const { - return this->tie() == other.tie(); -} - -bool ArrayShape::operator!=(ArrayShape const &other) const { - return this->tie() != other.tie(); -} - -ArrayShape - ArrayShape::sub_shape(ff_dim_t const &start, - std::optional const &maybe_end) const { - FFOrdered ff_ordered_dims = - ff_ordered_from_legion_ordered(this->dims); - FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); - return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; -} - -ArrayShape - ArrayShape::sub_shape(legion_dim_t const &start, - std::optional const &maybe_end) const { - return ArrayShape{slice(this->dims, start, maybe_end)}; -} - -std::optional ArrayShape::at_maybe(legion_dim_t index) const { - if (index.value < dims.size()) { - return dims.at(index); - } else { - return std::nullopt; - } -} - -std::optional ArrayShape::at_maybe(ff_dim_t index) const { - return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); -} - -std::tuple const &> ArrayShape::tie() const { - return std::tie(this->dims); -} - -std::string format_as(ArrayShape const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} - -std::ostream &operator<<(std::ostream &s, ArrayShape const &x) { - return (s << fmt::to_string(x)); -} - -positive_int get_num_elements(ArrayShape const &shape) { - return shape.num_elements(); -} - -ArrayShape array_shape_from_tensor_shape(TensorShape const &tensor_shape) { - return ArrayShape{ - legion_ordered_from_ff_ordered(tensor_shape.dims.ff_ordered)}; -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - -std::unordered_set get_ff_dim_t_set(ArrayShape const &shape) { - return unordered_set_of(get_idxs(ff_ordered_from_legion_ordered(shape.dims))); -} - -std::unordered_set get_array_coord_set(ArrayShape const &shape) { - std::vector> per_dim_ranges = transform( - vector_of(ff_ordered_from_legion_ordered(shape.dims)), - [](positive_int dim_size) -> std::vector { - return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); - }); - - std::unordered_set> raw_points = - unordered_set_of(cartesian_product(per_dim_ranges)); - - return transform(raw_points, - [](std::vector const &raw_point) { - return ArrayCoord{ff_ordered_of(raw_point)}; - }); -} - -ArrayShape array_shape_drop_dims( - ArrayShape const &shape, - std::function const &should_drop_dim) { - std::vector result; - for (ff_dim_t idx : get_idxs(ff_ordered_from_legion_ordered(shape.dims))) { - if (!should_drop_dim(idx)) { - result.push_back(shape.at(idx)); - } - } - - return ArrayShape{legion_ordered_from_ff_ordered(ff_ordered_of(result))}; -} - -} // namespace FlexFlow - -namespace std { - -using namespace FlexFlow; - -size_t hash::operator()(ArrayShape const &s) const { - return get_std_hash(s.tie()); -} - -} // namespace std diff --git a/lib/kernels/src/kernels/attention_kernels.cc b/lib/kernels/src/kernels/attention_kernels.cc new file mode 100644 index 0000000000..f3b024d7c9 --- /dev/null +++ b/lib/kernels/src/kernels/attention_kernels.cc @@ -0,0 +1,125 @@ +#include "kernels/attention_kernels.h" +#include "kernels/attention_kernels_cpu.h" +#include "kernels/attention_kernels_gpu.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &per_device_ff_handle, + Allocator &allocator, + int num_samples, + int 
num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*per_device_ff_handle=*/per_device_ff_handle.require_for_gpu(), + /*allocator=*/allocator, + /*num_samples=*/num_samples, + /*num_heads=*/num_heads, + /*qSize=*/qSize, + /*kSize=*/kSize, + /*vSize=*/vSize, + /*qProjSize=*/qProjSize, + /*kProjSize=*/kProjSize, + /*vProjSize=*/vProjSize, + /*oProjSize=*/oProjSize, + /*qoSeqLength=*/qoSeqLength, + /*kvSeqLength=*/kvSeqLength, + /*add_bias_kv=*/add_bias_kv); + } else { + ASSERT(per_device_ff_handle.is_for_cpu()); + ASSERT(device_type == DeviceType::CPU); + return std::nullopt; + } +} + +void forward_kernel(device_stream_t const &stream, + std::optional const &device_state, + float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*device_state=*/device_state.value(), + /*query_ptr=*/query_ptr, + /*key_ptr=*/key_ptr, + /*value_ptr=*/value_ptr, + /*weight_ptr=*/weight_ptr, + /*output_ptr=*/output_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(device_state == std::nullopt); + cpu_forward_kernel( + /*query_ptr=*/query_ptr, + /*key_ptr=*/key_ptr, + /*value_ptr=*/value_ptr, + /*weight_ptr=*/weight_ptr, + /*output_ptr=*/output_ptr); + } +} + +void backward_kernel(device_stream_t const &stream, + std::optional const &device_state, + float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*device_state=*/device_state.value(), + /*query_ptr=*/query_ptr, + /*query_grad_ptr=*/query_grad_ptr, + /*key_ptr=*/key_ptr, + /*key_grad_ptr=*/key_grad_ptr, + /*value_ptr=*/value_ptr, + /*value_grad_ptr=*/value_grad_ptr, + /*weight_ptr=*/weight_ptr, + /*weight_grad_ptr=*/weight_grad_ptr, + /*output_grad_ptr=*/output_grad_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(device_state == std::nullopt); + cpu_backward_kernel( + /*query_ptr=*/query_ptr, + /*query_grad_ptr=*/query_grad_ptr, + /*key_ptr=*/key_ptr, + /*key_grad_ptr=*/key_grad_ptr, + /*value_ptr=*/value_ptr, + /*value_grad_ptr=*/value_grad_ptr, + /*weight_ptr=*/weight_ptr, + /*weight_grad_ptr=*/weight_grad_ptr, + /*output_grad_ptr=*/output_grad_ptr); + } +} + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional const &device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(allocator, device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::MultiHeadAttention diff --git a/lib/kernels/src/kernels/attention_kernels_cpu.cc b/lib/kernels/src/kernels/attention_kernels_cpu.cc new file mode 100644 index 0000000000..5af254fa5e --- /dev/null +++ b/lib/kernels/src/kernels/attention_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/attention_kernels_cpu.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +void cpu_forward_kernel(float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr) { + NOT_IMPLEMENTED(); +} + +void 
cpu_backward_kernel(float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::MultiHeadAttention diff --git a/lib/kernels/src/kernels/batch_matmul_kernels.cc b/lib/kernels/src/kernels/batch_matmul_kernels.cc new file mode 100644 index 0000000000..652d4fb137 --- /dev/null +++ b/lib/kernels/src/kernels/batch_matmul_kernels.cc @@ -0,0 +1,93 @@ +#include "kernels/batch_matmul_kernels.h" +#include "kernels/batch_matmul_kernels_cpu.h" +#include "kernels/batch_matmul_kernels_gpu.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void forward_kernel(device_stream_t const &stream, + device_handle_t const &handle, + float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*handle=*/handle.require_for_gpu(), + /*output_ptr=*/output_ptr, + /*a_input_ptr=*/a_input_ptr, + /*b_input_ptr=*/b_input_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch, + /*seq_length=*/seq_length, + /*a_seq_length_dim=*/a_seq_length_dim, + /*b_seq_length_dim=*/b_seq_length_dim); + } else { + ASSERT(stream.is_cpu()); + ASSERT(handle.is_for_cpu()); + cpu_forward_kernel( + /*output_ptr=*/output_ptr, + /*a_input_ptr=*/a_input_ptr, + /*b_input_ptr=*/b_input_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch, + /*seq_length=*/seq_length, + /*a_seq_length_dim=*/a_seq_length_dim, + /*b_seq_length_dim=*/b_seq_length_dim); + } +} + +void backward_kernel(device_stream_t const &stream, + device_handle_t const &handle, + float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*handle=*/handle.require_for_gpu(), + /*o_ptr=*/o_ptr, + /*o_grad_ptr=*/o_grad_ptr, + /*a_ptr=*/a_ptr, + /*a_grad_ptr=*/a_grad_ptr, + /*b_ptr=*/b_ptr, + /*b_grad_ptr=*/b_grad_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch); + } else { + ASSERT(stream.is_cpu()); + ASSERT(handle.is_for_cpu()); + cpu_backward_kernel( + /*o_ptr=*/o_ptr, + /*o_grad_ptr=*/o_grad_ptr, + /*a_ptr=*/a_ptr, + /*a_grad_ptr=*/a_grad_ptr, + /*b_ptr=*/b_ptr, + /*b_grad_ptr=*/b_grad_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch); + } +} + +} // namespace FlexFlow::Kernels::BatchMatmul diff --git a/lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc b/lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc new file mode 100644 index 0000000000..f139d42992 --- /dev/null +++ b/lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc @@ -0,0 +1,31 @@ +#include "kernels/batch_matmul_kernels_cpu.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void cpu_forward_kernel(float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::BatchMatmul 
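The CPU paths above are declared but stubbed out with NOT_IMPLEMENTED(). As a point of reference, here is a minimal sketch of what cpu_forward_kernel could compute, assuming the same cuBLAS-style column-major m/n/k convention as the GPU path; the seq_length / a_seq_length_dim / b_seq_length_dim arguments are deliberately ignored, and the name cpu_forward_kernel_sketch is hypothetical, so this is illustrative only, not the behavior shipped in this patch.

// Naive batched C = A * B, column-major: A is m x k, B is k x n, C is m x n,
// with each batch's operands laid out contiguously one after another.
void cpu_forward_kernel_sketch(float *output_ptr,
                               float const *a_input_ptr,
                               float const *b_input_ptr,
                               int m, int n, int k, int batch) {
  for (int bi = 0; bi < batch; bi++) {
    float const *a = a_input_ptr + bi * m * k;
    float const *b = b_input_ptr + bi * k * n;
    float *c = output_ptr + bi * m * n;
    for (int col = 0; col < n; col++) {
      for (int row = 0; row < m; row++) {
        float acc = 0.0f;
        for (int i = 0; i < k; i++) {
          acc += a[i * m + row] * b[col * k + i]; // A(row, i) * B(i, col)
        }
        c[col * m + row] = acc; // C(row, col)
      }
    }
  }
}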
diff --git a/lib/kernels/src/kernels/batch_norm_kernels.cc b/lib/kernels/src/kernels/batch_norm_kernels.cc new file mode 100644 index 0000000000..e23f6a89e2 --- /dev/null +++ b/lib/kernels/src/kernels/batch_norm_kernels.cc @@ -0,0 +1,107 @@ +#include "kernels/batch_norm_kernels.h" +#include "kernels/batch_norm_kernels_cpu.h" +#include "kernels/batch_norm_kernels_gpu.h" + +namespace FlexFlow::Kernels::BatchNorm { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*allocator=*/allocator, + /*runningMean=*/runningMean, + /*output_n=*/output_n, + /*output_c=*/output_c, + /*output_h=*/output_h, + /*output_w=*/output_w, + /*relu=*/relu); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel(device_stream_t const &stream, + BatchNormPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state, + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*scale_ptr=*/scale_ptr, + /*bias_ptr=*/bias_ptr); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*per_device_state=*/per_device_state, + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*scale_ptr=*/scale_ptr, + /*bias_ptr=*/bias_ptr); + } +} + +void backward_kernel(device_stream_t const &stream, + BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state, + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*scale_ptr=*/scale_ptr, + /*scale_grad_ptr=*/scale_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*numElements=*/numElements); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*per_device_state=*/per_device_state, + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*scale_ptr=*/scale_ptr, + /*scale_grad_ptr=*/scale_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*numElements=*/numElements); + } +} + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(allocator, per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::BatchNorm diff --git a/lib/kernels/src/kernels/batch_norm_kernels_cpu.cc b/lib/kernels/src/kernels/batch_norm_kernels_cpu.cc new file mode 100644 index 0000000000..be440ac4ea --- /dev/null +++ b/lib/kernels/src/kernels/batch_norm_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/batch_norm_kernels_cpu.h" + +namespace FlexFlow::Kernels::BatchNorm { + +void cpu_forward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *input_ptr, + float 
*output_ptr, + float const *scale_ptr, + float const *bias_ptr) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::BatchNorm diff --git a/lib/kernels/src/kernels/cast_kernels.cc b/lib/kernels/src/kernels/cast_kernels.cc new file mode 100644 index 0000000000..2c668c42b2 --- /dev/null +++ b/lib/kernels/src/kernels/cast_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" +#include "kernels/cast_kernels_gpu.h" + +namespace FlexFlow::Kernels::Cast { + +void forward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*input=*/input, + /*output=*/output); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*input=*/input, + /*output=*/output); + } +} + +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*output_grad=*/output_grad, + /*input_grad=*/input_grad); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_grad=*/output_grad, + /*input_grad=*/input_grad); + } +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/kernels/cast_kernels_cpu.cc similarity index 82% rename from lib/kernels/src/cpu/ops/cast_kernels.cc rename to lib/kernels/src/kernels/cast_kernels_cpu.cc index 08a98f165b..f943fa142b 100644 --- a/lib/kernels/src/cpu/ops/cast_kernels.cc +++ b/lib/kernels/src/kernels/cast_kernels_cpu.cc @@ -21,7 +21,7 @@ template <DataType IDT, DataType ODT> struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(input.shape.dims).int_from_positive_int(); cpu_cast_forward(input.get<IDT>(), output.get<ODT>(), volume); } }; @@ -30,7 +30,7 @@ template <DataType IDT, DataType ODT> struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(output.shape.dims).int_from_positive_int(); cpu_cast_backward( output.get<IDT>(), input.get<ODT>(), volume, cast_to<ODT>(1.0f)); } }; @@ -39,13 +39,13 @@ struct CPUBackwardKernel { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { DataTypeDispatch2<CPUForwardKernel>{}( - input.data_type, output.data_type, input, output); + input.shape.data_type, output.shape.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { DataTypeDispatch2<CPUBackwardKernel>{}( - output.data_type, input.data_type, output, input); + output.shape.data_type, input.shape.data_type, output, input); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/kernels/concat_kernels.cc b/lib/kernels/src/kernels/concat_kernels.cc new file mode 100644 index 0000000000..8d0c3112aa --- /dev/null +++ b/lib/kernels/src/kernels/concat_kernels.cc @@ -0,0 +1,45 @@ +#include 
"kernels/concat_kernels.h" +#include "kernels/concat_kernels_cpu.h" +#include "kernels/concat_kernels_gpu.h" + +namespace FlexFlow::Kernels::Concat { + +void forward_kernel(device_stream_t const &stream, + GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*output=*/output, + /*inputs=*/inputs, + /*axis=*/axis); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*output=*/output, + /*inputs=*/inputs, + /*axis=*/axis); + } +} + +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*output_grad=*/output_grad, + /*input_grads=*/input_grads, + /*axis=*/axis); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_grad=*/output_grad, + /*input_grads=*/input_grads, + /*axis=*/axis); + } +} + +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/kernels/concat_kernels_cpu.cc b/lib/kernels/src/kernels/concat_kernels_cpu.cc new file mode 100644 index 0000000000..03bbff67bb --- /dev/null +++ b/lib/kernels/src/kernels/concat_kernels_cpu.cc @@ -0,0 +1,17 @@ +#include "kernels/concat_kernels_cpu.h" + +namespace FlexFlow::Kernels::Concat { + +void cpu_forward_kernel(GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/kernels/conv_2d_kernels.cc b/lib/kernels/src/kernels/conv_2d_kernels.cc new file mode 100644 index 0000000000..3008e7d1c0 --- /dev/null +++ b/lib/kernels/src/kernels/conv_2d_kernels.cc @@ -0,0 +1,118 @@ +#include "kernels/conv_2d_kernels.h" +#include "kernels/conv_2d_kernels_cpu.h" +#include "kernels/conv_2d_kernels_gpu.h" + +namespace FlexFlow::Kernels::Conv2D { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*activation=*/activation, + /*kernel_h=*/kernel_h, + /*kernel_w=*/kernel_w, + /*groups=*/groups, + /*padding_h=*/padding_h, + /*padding_w=*/padding_w, + /*stride_h=*/stride_h, + /*stride_w=*/stride_w, + /*input=*/input, + /*output=*/output, + /*filter_ptr=*/filter_ptr, + /*filter_grad_ptr=*/filter_grad_ptr); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional activation) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*filter_ptr=*/filter_ptr, + /*bias_ptr=*/bias_ptr, + /*activation=*/activation); + } else { + ASSERT(stream.is_cpu()); + 
cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*filter_ptr=*/filter_ptr, + /*bias_ptr=*/bias_ptr, + /*activation=*/activation); + } +} + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*filter_ptr=*/filter_ptr, + /*filter_grad_ptr=*/filter_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*activation=*/activation); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*filter_ptr=*/filter_ptr, + /*filter_grad_ptr=*/filter_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*activation=*/activation); + } +} + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::Conv2D diff --git a/lib/kernels/src/kernels/conv_2d_kernels_cpu.cc b/lib/kernels/src/kernels/conv_2d_kernels_cpu.cc new file mode 100644 index 0000000000..c595ecb586 --- /dev/null +++ b/lib/kernels/src/kernels/conv_2d_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/conv_2d_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Conv2D { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional const &activation) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional const &activation) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Conv2D diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc index d8619d8ce6..2989a3d227 100644 --- a/lib/kernels/src/kernels/copy_tensor_accessor.cc +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -7,9 +7,8 @@ template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + GenericTensorAccessorW dst_accessor = + allocator.allocate_tensor(src_accessor.shape); copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); @@ -21,28 +20,27 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); + src_accessor.shape.data_type, src_accessor, allocator); } template struct CopyTensorAccessorR { - GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + GenericTensorAccessorW operator()(GenericTensorAccessorR 
const &src_accessor, Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + GenericTensorAccessorW dst_accessor = + allocator.allocate_tensor(src_accessor.shape); copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - return read_only_accessor_from_write_accessor(dst_accessor); + return dst_accessor; } }; -GenericTensorAccessorR +GenericTensorAccessorW copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); + src_accessor.shape.data_type, src_accessor, allocator); } GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( diff --git a/lib/kernels/src/kernels/create_local_allocator_for_device_type.cc b/lib/kernels/src/kernels/create_local_allocator_for_device_type.cc new file mode 100644 index 0000000000..ea5f054d1b --- /dev/null +++ b/lib/kernels/src/kernels/create_local_allocator_for_device_type.cc @@ -0,0 +1,16 @@ +#include "kernels/create_local_allocator_for_device_type.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" + +namespace FlexFlow { + +Allocator create_local_allocator_for_device_type(DeviceType device_type) { + if (device_type == DeviceType::GPU) { + return create_local_cuda_memory_allocator(); + } else { + ASSERT(device_type == DeviceType::CPU); + return create_local_cpu_memory_allocator(); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/device_handle_t.cc b/lib/kernels/src/kernels/device_handle_t.cc new file mode 100644 index 0000000000..85f9e2a388 --- /dev/null +++ b/lib/kernels/src/kernels/device_handle_t.cc @@ -0,0 +1,24 @@ +#include "kernels/device_handle_t.h" + +namespace FlexFlow { + +device_handle_t device_handle_t_from_managed_handle( + std::optional const &managed_handle) { + if (managed_handle.has_value()) { + return gpu_make_device_handle_t(managed_handle.value().raw_handle()); + } else { + return cpu_make_device_handle_t(); + } +} + +device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle) { + return device_handle_t{ + ff_handle, + }; +} + +device_handle_t cpu_make_device_handle_t() { + return device_handle_t{std::monostate{}}; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/device_stream_t.cc b/lib/kernels/src/kernels/device_stream_t.cc new file mode 100644 index 0000000000..8efa54d8bd --- /dev/null +++ b/lib/kernels/src/kernels/device_stream_t.cc @@ -0,0 +1,25 @@ +#include "kernels/device_stream_t.h" +#include + +namespace FlexFlow { + +device_stream_t get_gpu_device_stream() { + ffStream_t stream; + checkCUDA(get_legion_stream(&stream)); + return device_stream_t{stream}; +} + +device_stream_t get_cpu_device_stream() { + return device_stream_t{std::monostate{}}; +} + +device_stream_t get_stream_for_device_type(DeviceType device_type) { + if (device_type == DeviceType::GPU) { + return get_gpu_device_stream(); + } else { + ASSERT(device_type == DeviceType::CPU); + return get_cpu_device_stream(); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/dropout_kernels.cc b/lib/kernels/src/kernels/dropout_kernels.cc new file mode 100644 index 0000000000..ae1f3d4c0e --- /dev/null +++ b/lib/kernels/src/kernels/dropout_kernels.cc @@ -0,0 +1,79 @@ +#include "kernels/dropout_kernels.h" +#include "kernels/dropout_kernels_cpu.h" +#include "kernels/dropout_kernels_gpu.h" + +namespace 
FlexFlow::Kernels::Dropout { + +std::optional<DropoutPerDeviceState> + init_kernel(DeviceType device_type, + device_handle_t const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*rate=*/rate, + /*seed=*/seed, + /*output_shape=*/output_shape, + /*allocator=*/allocator); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel( + device_stream_t const &stream, + std::optional<DropoutPerDeviceState> const &per_device_state, + float const *input_ptr, + float *output_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(per_device_state == std::nullopt); + cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } +} + +void backward_kernel( + device_stream_t const &stream, + std::optional<DropoutPerDeviceState> const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(per_device_state == std::nullopt); + cpu_backward_kernel( + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr); + } +} + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional<DropoutPerDeviceState> &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(allocator, per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::Dropout diff --git a/lib/kernels/src/kernels/dropout_kernels_cpu.cc b/lib/kernels/src/kernels/dropout_kernels_cpu.cc new file mode 100644 index 0000000000..f6558af96c --- /dev/null +++ b/lib/kernels/src/kernels/dropout_kernels_cpu.cc @@ -0,0 +1,14 @@ +#include "kernels/dropout_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Dropout { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Dropout diff --git a/lib/kernels/src/kernels/element_binary_kernels.cc b/lib/kernels/src/kernels/element_binary_kernels.cc new file mode 100644 index 0000000000..bea317dfec --- /dev/null +++ b/lib/kernels/src/kernels/element_binary_kernels.cc @@ -0,0 +1,116 @@ +#include "kernels/element_binary_kernels.h" +#include "kernels/element_binary_kernels_cpu.h" +#include "kernels/element_binary_kernels_gpu.h" + +namespace FlexFlow::Kernels::ElementBinary { + +std::optional<ElementBinaryPerDeviceState> + init_kernel(DeviceType device_type, + device_handle_t const &handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*op_type=*/op_type, + /*should_broadcast_lhs=*/should_broadcast_lhs, + /*should_broadcast_rhs=*/should_broadcast_rhs, + /*lhs_shape=*/lhs_shape, + /*rhs_shape=*/rhs_shape, + 
+        /*output_shape=*/output_shape);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementBinaryPerDeviceState> const &per_device_state,
+    float const *lhs_ptr,
+    float const *rhs_ptr,
+    float *out_ptr,
+    OperatorType op_type,
+    bool broadcast_inputLHS,
+    device_handle_t const &handle) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*out_ptr=*/out_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS,
+        /*handle=*/handle.require_for_gpu());
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_forward_kernel(
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*out_ptr=*/out_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementBinaryPerDeviceState> const &per_device_state,
+    float const *out_grad_ptr,
+    float const *lhs_ptr,
+    float const *rhs_ptr,
+    float *lhs_grad_ptr,
+    float *rhs_grad_ptr,
+    OperatorType op_type,
+    bool broadcast_inputLHS,
+    bool broadcast_inputRHS,
+    device_handle_t const &handle) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*out_grad_ptr=*/out_grad_ptr,
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*lhs_grad_ptr=*/lhs_grad_ptr,
+        /*rhs_grad_ptr=*/rhs_grad_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS,
+        /*broadcast_inputRHS=*/broadcast_inputRHS,
+        /*handle=*/handle.require_for_gpu());
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_backward_kernel(
+        /*out_grad_ptr=*/out_grad_ptr,
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*lhs_grad_ptr=*/lhs_grad_ptr,
+        /*rhs_grad_ptr=*/rhs_grad_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS,
+        /*broadcast_inputRHS=*/broadcast_inputRHS);
+  }
+}
+
+void cleanup_kernel(
+    DeviceType device_type,
+    std::optional<ElementBinaryPerDeviceState> const &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::ElementBinary
diff --git a/lib/kernels/src/kernels/element_binary_kernels_cpu.cc b/lib/kernels/src/kernels/element_binary_kernels_cpu.cc
new file mode 100644
index 0000000000..cbcd98dc7e
--- /dev/null
+++ b/lib/kernels/src/kernels/element_binary_kernels_cpu.cc
@@ -0,0 +1,25 @@
+#include "kernels/element_binary_kernels_cpu.h"
+#include "utils/exception.h"
+
+namespace FlexFlow::Kernels::ElementBinary {
+
+void cpu_forward_kernel(float const *lhs_ptr,
+                        float const *rhs_ptr,
+                        float *out_ptr,
+                        OperatorType op_type,
+                        bool broadcast_inputLHS) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(float const *out_grad_ptr,
+                         float const *lhs_ptr,
+                         float const *rhs_ptr,
+                         float *lhs_grad_ptr,
+                         float *rhs_grad_ptr,
+                         OperatorType op_type,
+                         bool broadcast_inputLHS,
+                         bool broadcast_inputRHS) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::ElementBinary
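Every wrapper in this patch follows the same dispatch recipe: branch on the stream, require per-device state on the GPU path, and require its absence on the CPU path. A distilled, self-contained sketch of that pattern (FooPerDeviceState and the gpu_/cpu_ entry points are placeholders, not real FlexFlow symbols):

#include <cassert>
#include <optional>

// Stand-ins so the sketch compiles on its own; the real types live in lib/kernels.
struct device_stream_t {
  bool gpu;
  bool is_gpu() const { return gpu; }
  bool is_cpu() const { return !gpu; }
};
struct FooPerDeviceState {};

void gpu_forward_kernel(FooPerDeviceState const &, float const *in, float *out) { *out = *in; }
void cpu_forward_kernel(float const *in, float *out) { *out = *in; }

// The shape shared by every forward_kernel above.
void forward_kernel(device_stream_t const &stream,
                    std::optional<FooPerDeviceState> const &per_device_state,
                    float const *in,
                    float *out) {
  if (stream.is_gpu()) {
    gpu_forward_kernel(per_device_state.value(), in, out);
  } else {
    assert(stream.is_cpu());
    assert(per_device_state == std::nullopt); // CPU path carries no state
    cpu_forward_kernel(in, out);
  }
}

The invariant "state present iff GPU" is established by init_kernel (which returns std::nullopt on CPU) and re-checked at every call site, so a mismatched stream/state pair fails loudly rather than silently.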
diff --git a/lib/kernels/src/kernels/element_unary_kernels.cc b/lib/kernels/src/kernels/element_unary_kernels.cc
new file mode 100644
index 0000000000..ff61385336
--- /dev/null
+++ b/lib/kernels/src/kernels/element_unary_kernels.cc
@@ -0,0 +1,92 @@
+#include "kernels/element_unary_kernels.h"
+#include "kernels/element_unary_kernels_cpu.h"
+#include "kernels/element_unary_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::ElementUnary {
+
+std::optional<ElementUnaryPerDeviceState>
+    init_kernel(DeviceType device_type,
+                TensorShape const &input_shape,
+                TensorShape const &output_shape,
+                ElementUnaryAttrs const &attrs) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*input_shape=*/input_shape,
+        /*output_shape=*/output_shape,
+        /*attrs=*/attrs);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementUnaryPerDeviceState> const &per_device_state,
+    ElementUnaryAttrs const &attrs,
+    device_handle_t const &handle,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &output) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*attrs=*/attrs,
+        /*handle=*/handle.require_for_gpu(),
+        /*input=*/input,
+        /*output=*/output);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_forward_kernel(
+        /*attrs=*/attrs,
+        /*input=*/input,
+        /*output=*/output);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementUnaryPerDeviceState> const &per_device_state,
+    ElementUnaryAttrs const &attrs,
+    device_handle_t const &handle,
+    GenericTensorAccessorR const &output,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*attrs=*/attrs,
+        /*handle=*/handle.require_for_gpu(),
+        /*output=*/output,
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_backward_kernel(
+        /*attrs=*/attrs,
+        /*output=*/output,
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad);
+  }
+}
+
+void cleanup_kernel(
+    DeviceType device_type,
+    std::optional<ElementUnaryPerDeviceState> &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::ElementUnary
diff --git a/lib/kernels/src/kernels/element_unary_kernels_cpu.cc b/lib/kernels/src/kernels/element_unary_kernels_cpu.cc
new file mode 100644
index 0000000000..0c2f521b96
--- /dev/null
+++ b/lib/kernels/src/kernels/element_unary_kernels_cpu.cc
@@ -0,0 +1,19 @@
+#include "kernels/element_unary_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::ElementUnary {
+
+void cpu_forward_kernel(ElementUnaryAttrs const &attrs,
+                        GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(ElementUnaryAttrs const &attrs,
+                         GenericTensorAccessorR const &output,
+                         GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &input_grad) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::ElementUnary
diff --git a/lib/kernels/src/kernels/embedding_kernels.cc b/lib/kernels/src/kernels/embedding_kernels.cc
new file mode 100644
index 0000000000..957d297b9e
--- /dev/null
+++ b/lib/kernels/src/kernels/embedding_kernels.cc
@@ -0,0 +1,81 @@
+#include "kernels/embedding_kernels.h"
+#include "kernels/embedding_kernels_cpu.h"
+#include "kernels/embedding_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Embedding {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input,
+                    GenericTensorAccessorW const &output,
+                    GenericTensorAccessorR const &weight,
+                    DataType input_data_type,
+                    DataType output_data_type,
+                    std::optional<AggregateOp> aggr,
+                    int in_dim,
+                    int out_dim,
+                    int batch_size) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output=*/output,
+        /*weight=*/weight,
+        /*input_data_type=*/input_data_type,
+        /*output_data_type=*/output_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output=*/output,
+        /*weight=*/weight,
+        /*input_data_type=*/input_data_type,
+        /*output_data_type=*/output_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &output,
+                     GenericTensorAccessorR const &input,
+                     GenericTensorAccessorW const &weight_grad,
+                     DataType output_data_type,
+                     DataType input_data_type,
+                     std::optional<AggregateOp> aggr,
+                     int in_dim,
+                     int out_dim,
+                     int batch_size) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*output=*/output,
+        /*input=*/input,
+        /*weight_grad=*/weight_grad,
+        /*output_data_type=*/output_data_type,
+        /*input_data_type=*/input_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*output=*/output,
+        /*input=*/input,
+        /*weight_grad=*/weight_grad,
+        /*output_data_type=*/output_data_type,
+        /*input_data_type=*/input_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Embedding
diff --git a/lib/kernels/src/kernels/embedding_kernels_cpu.cc b/lib/kernels/src/kernels/embedding_kernels_cpu.cc
new file mode 100644
index 0000000000..f5df53e322
--- /dev/null
+++ b/lib/kernels/src/kernels/embedding_kernels_cpu.cc
@@ -0,0 +1,29 @@
+#include "kernels/embedding_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Embedding {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        GenericTensorAccessorR const &weight,
+                        DataType input_data_type,
+                        DataType output_data_type,
+                        std::optional<AggregateOp> aggr,
+                        int in_dim,
+                        int out_dim,
+                        int batch_size) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &weight_grad,
+                         DataType output_data_type,
+                         DataType input_data_type,
+                         std::optional<AggregateOp> aggr,
+                         int in_dim,
+                         int out_dim,
+                         int batch_size) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Embedding
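The CPU embedding kernels above are left NOT_IMPLEMENTED. For intuition about what they will eventually compute, here is a self-contained sketch of the plain lookup case (aggr == std::nullopt); the flat std::vector layout and function name are illustrative assumptions, not the accessor-based FlexFlow implementation:

#include <cstdint>
#include <vector>

// Plain embedding lookup: out[i] = weight[token[i]], row by row.
std::vector<float> embedding_forward(std::vector<int32_t> const &tokens,
                                     std::vector<float> const &weight, // [vocab, out_dim], row-major
                                     int out_dim) {
  std::vector<float> out(tokens.size() * out_dim);
  for (size_t i = 0; i < tokens.size(); i++) {
    for (int d = 0; d < out_dim; d++) {
      out[i * out_dim + d] = weight[tokens[i] * out_dim + d];
    }
  }
  return out;
}

The backward pass is the scatter dual of this gather: each output gradient row is accumulated into weight_grad at the row indexed by the corresponding token.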
diff --git a/lib/kernels/src/kernels/fill_tensor_accessor.cc b/lib/kernels/src/kernels/fill_tensor_accessor.cc
index bee8d12556..7b5a01d03f 100644
--- a/lib/kernels/src/kernels/fill_tensor_accessor.cc
+++ b/lib/kernels/src/kernels/fill_tensor_accessor.cc
@@ -1,11 +1,33 @@
 #include "kernels/fill_tensor_accessor.h"
+#include "kernels/datatype_dispatch.h"
 #include "op-attrs/datatype_value.h"
+#include "op-attrs/tensor_shape.h"
 
 namespace FlexFlow {
 
-void fill_tensor_accessor(GenericTensorAccessorW &accessor, DataTypeValue val) {
-  ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.data_type == get_data_type_of_data_type_value(val));
+template <DataType DT>
+struct FillWithZeros {
+  void operator()(GenericTensorAccessorW const &accessor) {
+    using T = real_type_t<DT>;
+
+    if (accessor.device_type == DeviceType::CPU) {
+      memset(accessor.ptr,
+             0,
+             get_size_in_bytes(accessor.shape)
+                 .unwrap_num_bytes()
+                 .unwrap_nonnegative());
+    } else {
+      checkCUDA(cudaMemset(accessor.ptr,
+                           0,
+                           get_size_in_bytes(accessor.shape)
+                               .unwrap_num_bytes()
+                               .unwrap_nonnegative()));
+    }
+  }
+};
+
+void fill_with_zeros(GenericTensorAccessorW const &accessor) {
+  DataTypeDispatch1<FillWithZeros>{}(accessor.shape.data_type, accessor);
 }
 
 GenericTensorAccessorW create_accessor_w_filled_with(
diff --git a/lib/kernels/src/kernels/flat_kernels.cc b/lib/kernels/src/kernels/flat_kernels.cc
new file mode 100644
index 0000000000..1032e081e7
--- /dev/null
+++ b/lib/kernels/src/kernels/flat_kernels.cc
@@ -0,0 +1,42 @@
+#include "kernels/flat_kernels.h"
+#include "kernels/flat_kernels_cpu.h"
+#include "kernels/flat_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Flat {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input,
+                    float *output_ptr) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output_ptr=*/output_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output_ptr=*/output_ptr);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &input,
+                     float const *output_grad_ptr,
+                     float *input_grad_ptr) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*input=*/input,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Flat
diff --git a/lib/kernels/src/kernels/flat_kernels_cpu.cc b/lib/kernels/src/kernels/flat_kernels_cpu.cc
new file mode 100644
index 0000000000..b7de8dd8ff
--- /dev/null
+++ b/lib/kernels/src/kernels/flat_kernels_cpu.cc
@@ -0,0 +1,16 @@
+#include "kernels/flat_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Flat {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        float *output_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &input,
+                         float const *output_grad_ptr,
+                         float *input_grad_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Flat
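fill_with_zeros and the accessor formatters below all funnel through DataTypeDispatch1, which turns a runtime DataType into a compile-time template argument. A self-contained sketch of that dispatch idea (deliberately simplified; the real utility lives in kernels/datatype_dispatch.h and supports more types and argument forwarding):

#include <cstdio>
#include <stdexcept>
#include <utility>

enum class DataType { FLOAT, INT32 };

template <DataType DT> struct real_type;
template <> struct real_type<DataType::FLOAT> { using type = float; };
template <> struct real_type<DataType::INT32> { using type = int; };

// A functor template instantiated once per data type, mirroring FillWithZeros.
template <DataType DT> struct PrintSize {
  void operator()() { printf("%zu bytes\n", sizeof(typename real_type<DT>::type)); }
};

// Runtime-to-compile-time dispatch, simplified from DataTypeDispatch1.
template <template <DataType> class F, typename... Args>
void dispatch(DataType dt, Args &&...args) {
  switch (dt) {
    case DataType::FLOAT: F<DataType::FLOAT>{}(std::forward<Args>(args)...); break;
    case DataType::INT32: F<DataType::INT32>{}(std::forward<Args>(args)...); break;
    default: throw std::runtime_error("unhandled DataType");
  }
}

int main() {
  dispatch<PrintSize>(DataType::FLOAT); // prints "4 bytes"
}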
diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc
index d40e5c4268..cbdf2870dd 100644
--- a/lib/kernels/src/kernels/format_accessor_contents.cc
+++ b/lib/kernels/src/kernels/format_accessor_contents.cc
@@ -2,6 +2,7 @@
 #include "kernels/copy_tensor_accessor.h"
 #include "kernels/datatype_dispatch.h"
 #include "kernels/local_cpu_allocator.h"
+#include "op-attrs/tensor_shape.h"
 #include "utils/indent.h"
 #include "utils/nonnegative_int/nonnegative_range.h"
 #include <sstream>
@@ -13,17 +14,18 @@ struct Print1DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
     ASSERT(accessor.device_type == DeviceType::CPU);
-    nonnegative_int dims = accessor.shape.num_dims();
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
     ASSERT(dims == 1_n);
 
-    positive_int ncols = accessor.shape.at(ff_dim_t{0_n});
+    positive_int ncols = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
 
     stream << "["
            << join_strings(
                   nonnegative_range(ncols.nonnegative_int_from_positive_int()),
                   " ",
                   [&](nonnegative_int col_idx) -> std::string {
-                    return fmt::to_string(accessor.at<DT>(FFOrdered{col_idx}));
+                    return fmt::to_string(
+                        accessor.at<DT>(TensorDimsCoord{FFOrdered{col_idx}}));
                   })
            << "]";
   }
@@ -32,10 +34,11 @@ struct Print1DCPUAccessorR {
 static std::string
     format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.shape.num_dims() == 1_n);
+  ASSERT(get_num_dims(accessor.shape.dims) == 1_n);
 
   std::ostringstream oss;
-  DataTypeDispatch1<Print1DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  DataTypeDispatch1<Print1DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
   return oss.str();
 }
 
@@ -44,20 +47,21 @@ struct Print2DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
     ASSERT(accessor.device_type == DeviceType::CPU);
-    nonnegative_int dims = accessor.shape.num_dims();
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
     ASSERT(dims == 2_n);
 
-    positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n});
-    positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n});
+    positive_int dim0_size = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
+    positive_int dim1_size = dim_at_idx(accessor.shape.dims, ff_dim_t{1_n});
 
     auto render_1d = [&](nonnegative_int dim0_idx) -> std::string {
       return "[" +
-             join_strings(nonnegative_range(
-                              dim1_size.nonnegative_int_from_positive_int()),
-                          " ",
-                          [&](nonnegative_int dim1_idx) -> std::string {
-                            return fmt::to_string(
-                                accessor.at<DT>(FFOrdered{dim0_idx, dim1_idx}));
-                          }) +
+             join_strings(
+                 nonnegative_range(
+                     dim1_size.nonnegative_int_from_positive_int()),
+                 " ",
+                 [&](nonnegative_int dim1_idx) -> std::string {
+                   return fmt::to_string(accessor.at<DT>(
+                       TensorDimsCoord{FFOrdered{dim0_idx, dim1_idx}}));
+                 }) +
              "]";
     };
 
@@ -74,10 +78,11 @@ struct Print2DCPUAccessorR {
 static std::string
     format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.shape.num_dims() == 2_n);
+  ASSERT(get_num_dims(accessor.shape.dims) == 2_n);
 
   std::ostringstream oss;
-  DataTypeDispatch1<Print2DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  DataTypeDispatch1<Print2DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
   return oss.str();
 }
 
@@ -86,12 +91,12 @@ struct Print3DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
     ASSERT(accessor.device_type == DeviceType::CPU);
-    nonnegative_int dims = accessor.shape.num_dims();
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
     ASSERT(dims == 3_n);
 
-    positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n});
-    positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n});
-    positive_int dim2_size = accessor.shape.at(ff_dim_t{2_n});
+    positive_int dim0_size = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
+    positive_int dim1_size = dim_at_idx(accessor.shape.dims, ff_dim_t{1_n});
+    positive_int dim2_size = dim_at_idx(accessor.shape.dims, ff_dim_t{2_n});
 
     auto render_1d = [&](nonnegative_int dim0_idx,
                          nonnegative_int dim1_idx) -> std::string {
@@ -100,8 +105,9 @@ struct Print3DCPUAccessorR {
                      dim2_size.nonnegative_int_from_positive_int()),
                  " ",
                  [&](nonnegative_int dim2_idx) -> std::string {
-                   return fmt::to_string(accessor.at<DT>(
-                       FFOrdered{dim0_idx, dim1_idx, dim2_idx}));
+                   return fmt::to_string(
+                       accessor.at<DT>(TensorDimsCoord{
+                           FFOrdered{dim0_idx, dim1_idx, dim2_idx}}));
                  }) +
              "]";
     };
@@ -131,10 +137,85 @@ struct Print3DCPUAccessorR {
 static std::string
     format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.shape.num_dims() == 3_n);
+  ASSERT(get_num_dims(accessor.shape.dims) == 3_n);
 
   std::ostringstream oss;
-  DataTypeDispatch1<Print3DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  DataTypeDispatch1<Print3DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
+  return oss.str();
+}
+
+template <DataType DT>
+struct Print4DCPUAccessorR {
+  void operator()(GenericTensorAccessorR const &accessor,
+                  std::ostream &stream) {
+    ASSERT(accessor.device_type == DeviceType::CPU);
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
+    ASSERT(dims == 4_n);
+
+    positive_int dim0_size = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
+    positive_int dim1_size = dim_at_idx(accessor.shape.dims, ff_dim_t{1_n});
+    positive_int dim2_size = dim_at_idx(accessor.shape.dims, ff_dim_t{2_n});
+    positive_int dim3_size = dim_at_idx(accessor.shape.dims, ff_dim_t{3_n});
+
+    auto render_1d = [&](nonnegative_int dim0_idx,
+                         nonnegative_int dim1_idx,
+                         nonnegative_int dim2_idx) -> std::string {
+      return "[" +
+             join_strings(
+                 nonnegative_range(
+                     dim3_size.nonnegative_int_from_positive_int()),
+                 " ",
+                 [&](nonnegative_int dim3_idx) -> std::string {
+                   return fmt::to_string(accessor.at<DT>(TensorDimsCoord{
+                       FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}}));
+                 }) +
+             "]";
+    };
+
+    auto render_2d = [&](nonnegative_int dim0_idx,
+                         nonnegative_int dim1_idx) -> std::string {
+      return "[\n" +
+             indent(join_strings(
+                 nonnegative_range(
+                     dim2_size.nonnegative_int_from_positive_int()),
+                 "\n",
+                 [&](nonnegative_int dim2_idx) -> std::string {
+                   return render_1d(dim0_idx, dim1_idx, dim2_idx);
+                 })) +
+             "\n]";
+    };
+
+    auto render_3d = [&](nonnegative_int dim0_idx) -> std::string {
+      return "[\n" +
+             indent(join_strings(
+                 nonnegative_range(
+                     dim1_size.nonnegative_int_from_positive_int()),
+                 "\n",
+                 [&](nonnegative_int dim1_idx) -> std::string {
+                   return render_2d(dim0_idx, dim1_idx);
+                 })) +
+             "\n]";
+    };
+
+    stream << "[\n"
+           << indent(join_strings(
+                  nonnegative_range(
+                      dim0_size.nonnegative_int_from_positive_int()),
+                  "\n",
+                  render_3d))
+           << "\n]";
+  }
+};
+
+static std::string
+    format_4d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
+  ASSERT(accessor.device_type == DeviceType::CPU);
+  ASSERT(get_num_dims(accessor.shape.dims) == 4_n);
+
+  std::ostringstream oss;
+  DataTypeDispatch1<Print4DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
   return oss.str();
 }
 
@@ -156,12 +237,18 @@ static std::string
       read_only_accessor_from_write_accessor(accessor));
 }
 
+static std::string
+    format_4d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_4d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
 std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   Allocator cpu_allocator = create_local_cpu_memory_allocator();
   GenericTensorAccessorR cpu_accessor =
       copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
-  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  int num_dims = get_num_dims(cpu_accessor.shape.dims).unwrap_nonnegative();
   switch (num_dims) {
     case 1:
       return format_1d_accessor_r_contents(cpu_accessor);
@@ -169,6 +256,8 @@ std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) {
       return format_2d_accessor_r_contents(cpu_accessor);
     case 3:
       return format_3d_accessor_r_contents(cpu_accessor);
+    case 4:
+      return format_4d_accessor_r_contents(cpu_accessor);
     default:
       PANIC("Unhandled accessor dimensionality", num_dims);
   }
@@ -179,7 +268,7 @@ std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) {
   GenericTensorAccessorW cpu_accessor =
       copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator);
-  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  int num_dims = get_num_dims(cpu_accessor.shape.dims).unwrap_nonnegative();
   switch (num_dims) {
     case 1:
       return format_1d_accessor_w_contents(cpu_accessor);
@@ -187,6 +276,8 @@ std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) {
       return format_2d_accessor_w_contents(cpu_accessor);
     case 3:
       return format_3d_accessor_w_contents(cpu_accessor);
+    case 4:
+      return format_4d_accessor_w_contents(cpu_accessor);
     default:
       PANIC("Unhandled accessor dimensionality", num_dims);
   }
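For reference, the nested render_*d helpers produce bracketed rows with one level of indentation per dimension, so a 2x3 accessor formats roughly as (illustrative; exact whitespace comes from indent and join_strings):

[
  [1 2 3]
  [4 5 6]
]

The new 4D path adds one more render level to the same recursion rather than changing the scheme.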
diff --git a/lib/kernels/src/kernels/gather_kernels.cc b/lib/kernels/src/kernels/gather_kernels.cc
new file mode 100644
index 0000000000..a21d132511
--- /dev/null
+++ b/lib/kernels/src/kernels/gather_kernels.cc
@@ -0,0 +1,66 @@
+#include "kernels/gather_kernels.h"
+#include "kernels/gather_kernels_cpu.h"
+#include "kernels/gather_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Gather {
+
+std::optional<GatherPerDeviceState> init_kernel(DeviceType device_type,
+                                                device_handle_t const &handle,
+                                                ff_dim_t dim) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*handle=*/handle.require_for_gpu(),
+        /*dim=*/dim);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(device_stream_t const &stream,
+                    std::optional<GatherPerDeviceState> const &per_device_state,
+                    GenericTensorAccessorR const &input,
+                    GenericTensorAccessorR const &index,
+                    GenericTensorAccessorW const &output) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*input=*/input,
+        /*index=*/index,
+        /*output=*/output);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*index=*/index,
+        /*output=*/output);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<GatherPerDeviceState> const &per_device_state,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &index,
+    GenericTensorAccessorW const &input_grad) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_grad=*/output_grad,
+        /*index=*/index,
+        /*input_grad=*/input_grad);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_backward_kernel(
+        /*output_grad=*/output_grad,
+        /*index=*/index,
+        /*input_grad=*/input_grad);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Gather
diff --git a/lib/kernels/src/kernels/gather_kernels_cpu.cc b/lib/kernels/src/kernels/gather_kernels_cpu.cc
new file mode 100644
index 0000000000..ed216802b3
--- /dev/null
+++ b/lib/kernels/src/kernels/gather_kernels_cpu.cc
@@ -0,0 +1,17 @@
+#include "kernels/gather_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Gather {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorR const &index,
+                        GenericTensorAccessorW const &output) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &index,
+                         GenericTensorAccessorW const &input_grad) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Gather
diff --git a/lib/kernels/src/kernels/layer_norm_kernels.cc b/lib/kernels/src/kernels/layer_norm_kernels.cc
new file mode 100644
index 0000000000..3db6ec734c
--- /dev/null
+++ b/lib/kernels/src/kernels/layer_norm_kernels.cc
@@ -0,0 +1,99 @@
+#include "kernels/layer_norm_kernels.h"
+#include "kernels/layer_norm_kernels_cpu.h"
+#include "kernels/layer_norm_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::LayerNorm {
+
+std::optional<LayerNormPerDeviceState>
+    init_kernel(DeviceType device_type,
+                device_handle_t const &handle,
+                Allocator &allocator,
+                bool elementwise_affine,
+                int64_t effective_batch_size,
+                int64_t effective_num_elements,
+                float eps) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*handle=*/handle.require_for_gpu(),
+        /*allocator=*/allocator,
+        /*elementwise_affine=*/elementwise_affine,
+        /*effective_batch_size=*/effective_batch_size,
+        /*effective_num_elements=*/effective_num_elements,
+        /*eps=*/eps);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(
+    device_stream_t const &stream,
+    std::optional<LayerNormPerDeviceState> const &per_device_state,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &output,
+    GenericTensorAccessorW const &gamma,
+    GenericTensorAccessorW const &beta) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*input=*/input,
+        /*output=*/output,
+        /*gamma=*/gamma,
+        /*beta=*/beta);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output=*/output,
+        /*gamma=*/gamma,
+        /*beta=*/beta);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<LayerNormPerDeviceState> const &per_device_state,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &gamma,
+    GenericTensorAccessorW const &gamma_grad,
+    GenericTensorAccessorW const &beta_grad) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad,
+        /*gamma=*/gamma,
+        /*gamma_grad=*/gamma_grad,
+        /*beta_grad=*/beta_grad);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_backward_kernel(
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad,
+        /*gamma=*/gamma,
+        /*gamma_grad=*/gamma_grad,
+        /*beta_grad=*/beta_grad);
+  }
+}
+
+void cleanup_kernel(
+    DeviceType device_type,
+    std::optional<LayerNormPerDeviceState> const &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::LayerNorm
diff --git a/lib/kernels/src/kernels/layer_norm_kernels_cpu.cc b/lib/kernels/src/kernels/layer_norm_kernels_cpu.cc
new file mode 100644
index 0000000000..f6922f7cf4
--- /dev/null
+++ b/lib/kernels/src/kernels/layer_norm_kernels_cpu.cc
@@ -0,0 +1,21 @@
+#include "kernels/layer_norm_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::LayerNorm {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        GenericTensorAccessorW const &gamma,
+                        GenericTensorAccessorW const &beta) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &input_grad,
+                         GenericTensorAccessorR const &gamma,
+                         GenericTensorAccessorW const &gamma_grad,
+                         GenericTensorAccessorW const &beta_grad) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::LayerNorm
diff --git a/lib/kernels/src/kernels/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc
index 47b5b3e5ed..f3fa67387a 100644
--- a/lib/kernels/src/kernels/legion_dim.cc
+++ b/lib/kernels/src/kernels/legion_dim.cc
@@ -1,8 +1,22 @@
 #include "kernels/legion_dim.h"
+#include "op-attrs/tensor_dims.h"
 #include "utils/archetypes/value_type.h"
 
 namespace FlexFlow {
 
+positive_int dim_at_idx(TensorDims const &tensor_dims,
+                        legion_dim_t legion_dim) {
+  return dim_at_idx(
+      tensor_dims,
+      ff_dim_from_legion_dim(legion_dim, get_num_dims(tensor_dims)));
+}
+
+positive_int &dim_at_idx(TensorDims &tensor_dims, legion_dim_t legion_dim) {
+  return dim_at_idx(
+      tensor_dims,
+      ff_dim_from_legion_dim(legion_dim, get_num_dims(tensor_dims)));
+}
+
 using T = value_type<0>;
 
 template std::set<legion_dim_t> key_range(LegionOrdered<T> const &);
diff --git a/lib/kernels/src/kernels/linear_kernels.cc b/lib/kernels/src/kernels/linear_kernels.cc
new file mode 100644
index 0000000000..f301e89b6e
--- /dev/null
+++ b/lib/kernels/src/kernels/linear_kernels.cc
@@ -0,0 +1,148 @@
+#include "kernels/linear_kernels.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/linear_kernels_cpu.h" +#include "kernels/linear_kernels_gpu.h" +#include "kernels/local_cuda_allocator.h" +#include + +using namespace FlexFlow::Kernels::Linear; + +namespace FlexFlow { + +std::optional + linear_init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int channel) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*activation=*/activation, + /*regularizer=*/regularizer, + /*use_bias=*/use_bias, + /*input_type=*/input_type, + /*weight_type=*/weight_type, + /*output_type=*/output_type, + /*batch_size=*/batch_size, + /*channel=*/channel); + } else { + ASSERT(device_type == DeviceType::CPU); + return std::nullopt; + } +} + +void linear_forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW const &output_accessor, + GenericTensorAccessorR const &filter_accessor, + std::optional const &bias_accessor) { + if (stream.is_gpu()) { + positive_int in_dim = dim_at_idx(input_accessor.shape.dims, ff_dim_t{1_n}); + positive_int out_dim = + dim_at_idx(output_accessor.shape.dims, ff_dim_t{1_n}); + positive_int batch_size = + dim_at_idx(input_accessor.shape.dims, ff_dim_t{0_n}); + + float const *bias_ptr = nullptr; + if (bias_accessor.has_value()) { + bias_ptr = bias_accessor.value().get(); + } + + ASSERT(per_device_state.has_value()); + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_accessor.get_float_ptr(), + /*output_ptr=*/output_accessor.get_float_ptr(), + /*filter_ptr=*/filter_accessor.get_float_ptr(), + /*bias_ptr=*/bias_ptr, + /*in_dim=*/in_dim.int_from_positive_int(), + /*out_dim=*/out_dim.int_from_positive_int(), + /*batch_size=*/batch_size.int_from_positive_int()); + } else { + ASSERT(stream.is_cpu()); + ASSERT(per_device_state == std::nullopt); + linear_cpu_forward_kernel( + /*attrs=*/attrs, + /*input_accessor=*/input_accessor, + /*output_accessor=*/output_accessor, + /*filter_accessor=*/filter_accessor, + /*bias_accessor=*/bias_accessor); + } +} + +void linear_backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &kernel, + GenericTensorAccessorW const &kernel_grad, + std::optional const &bias_grad) { + if (stream.is_gpu()) { + float *bias_grad_ptr = + transform(bias_grad, [](GenericTensorAccessorW const &b) { + return b.get_float_ptr(); + }).value_or(nullptr); + + positive_int in_dim = dim_at_idx(input.shape.dims, ff_dim_t{1_n}); + positive_int out_dim = dim_at_idx(output.shape.dims, ff_dim_t{1_n}); + positive_int batch_size = dim_at_idx(input.shape.dims, ff_dim_t{0_n}); + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + GenericTensorAccessorW modifiable_output_grad = + copy_tensor_accessor_r(output_grad, gpu_allocator); + + ASSERT(per_device_state.has_value()); + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + 
+void linear_backward_kernel(
+    device_stream_t const &stream,
+    std::optional<LinearPerDeviceState> const &per_device_state,
+    LinearAttrs const &attrs,
+    GenericTensorAccessorR const &output,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &kernel,
+    GenericTensorAccessorW const &kernel_grad,
+    std::optional<GenericTensorAccessorW> const &bias_grad) {
+  if (stream.is_gpu()) {
+    float *bias_grad_ptr =
+        transform(bias_grad, [](GenericTensorAccessorW const &b) {
+          return b.get_float_ptr();
+        }).value_or(nullptr);
+
+    positive_int in_dim = dim_at_idx(input.shape.dims, ff_dim_t{1_n});
+    positive_int out_dim = dim_at_idx(output.shape.dims, ff_dim_t{1_n});
+    positive_int batch_size = dim_at_idx(input.shape.dims, ff_dim_t{0_n});
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    GenericTensorAccessorW modifiable_output_grad =
+        copy_tensor_accessor_r(output_grad, gpu_allocator);
+
+    ASSERT(per_device_state.has_value());
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_ptr=*/output.get_float_ptr(),
+        /*output_grad_ptr=*/modifiable_output_grad.get_float_ptr(),
+        /*input_ptr=*/input.get_float_ptr(),
+        /*input_grad_ptr=*/input_grad.get_float_ptr(),
+        /*kernel_ptr=*/kernel.get_float_ptr(),
+        /*kernel_grad_ptr=*/kernel_grad.get_float_ptr(),
+        /*bias_grad_ptr=*/bias_grad_ptr,
+        /*in_dim=*/in_dim.int_from_positive_int(),
+        /*out_dim=*/out_dim.int_from_positive_int(),
+        /*batch_size=*/batch_size.int_from_positive_int());
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    linear_cpu_backward_kernel(
+        /*attrs=*/attrs,
+        /*output=*/output,
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad,
+        /*kernel=*/kernel,
+        /*kernel_grad=*/kernel_grad,
+        /*bias_grad=*/bias_grad);
+  }
+}
+
+void linear_cleanup_kernel(
+    DeviceType device_type,
+    std::optional<LinearPerDeviceState> &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/linear_kernels_cpu.cc b/lib/kernels/src/kernels/linear_kernels_cpu.cc
new file mode 100644
index 0000000000..f26df8081e
--- /dev/null
+++ b/lib/kernels/src/kernels/linear_kernels_cpu.cc
@@ -0,0 +1,96 @@
+#include "kernels/linear_kernels_cpu.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/map_tensor_accessors.h"
+#include "kernels/tensor_accessor_binary_ops.h"
+#include "kernels/tensor_accessor_unary_ops.h"
+#include "utils/exception.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include <libassert/assert.hpp>
+
+namespace FlexFlow {
+
+void linear_cpu_forward_kernel(
+    LinearAttrs const &attrs,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &output,
+    GenericTensorAccessorR const &projection,
+    std::optional<GenericTensorAccessorR> const &bias) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+  tensor_accessor_matmul_to(
+      input, tensor_accessor_transpose(projection, cpu_allocator), output);
+
+  ASSERT(attrs.use_bias == bias.has_value());
+  if (bias.has_value()) {
+    GenericTensorAccessorW broadcasted_bias = tensor_accessor_broadcast(
+        bias.value(), output.shape.dims, cpu_allocator);
+    tensor_accessor_elementwise_add_to(
+        read_only_accessor_from_write_accessor(output),
+        read_only_accessor_from_write_accessor(broadcasted_bias),
+        output);
+  }
+
+  if (attrs.activation.has_value()) {
+    switch (attrs.activation.value()) {
+      case Activation::RELU:
+        tensor_accessor_relu_to(read_only_accessor_from_write_accessor(output),
+                                output);
+        break;
+      default:
+        PANIC("Unhandled activation function", attrs.activation.value());
+    }
+  }
+}
+
+// template <typename T>
+static float single_element_relu_bwd(float elem) {
+  if (elem > 0) {
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+void linear_cpu_backward_kernel(
+    LinearAttrs const &attrs,
+    GenericTensorAccessorR const &output,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &projection,
+    GenericTensorAccessorW const &projection_grad,
+    std::optional<GenericTensorAccessorW> const &bias_grad) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+  std::optional<GenericTensorAccessorR> processed_output_grad = std::nullopt;
+  if (attrs.activation.has_value()) {
+    switch (attrs.activation.value()) {
+      case Activation::RELU:
+        processed_output_grad =
+            read_only_accessor_from_write_accessor(map_tensor_accessor(
+                output_grad, single_element_relu_bwd, cpu_allocator));
+        break;
+      default:
+        PANIC("Unhandled activation function", attrs.activation.value());
+    }
+  } else {
+    processed_output_grad = output_grad;
+  }
+
+  tensor_accessor_matmul_to(
+      processed_output_grad.value(), projection, input_grad);
+  tensor_accessor_transpose_to(
+      tensor_accessor_matmul(
+          read_only_accessor_from_write_accessor(
+              tensor_accessor_transpose(input, cpu_allocator)),
+          processed_output_grad.value(),
+          cpu_allocator),
+      projection_grad);
+
+  if (bias_grad.has_value()) {
+    tensor_accessor_reduce_to(
+        processed_output_grad.value(), ff_dim_t{0_n}, bias_grad.value());
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/loss_function_kernels.cc b/lib/kernels/src/kernels/loss_function_kernels.cc
new file mode 100644
index 0000000000..df96bab9b0
--- /dev/null
+++ b/lib/kernels/src/kernels/loss_function_kernels.cc
@@ -0,0 +1,126 @@
+#include "kernels/loss_function_kernels.h"
+#include "kernels/loss_function_kernels_cpu.h"
+#include "kernels/loss_function_kernels_gpu.h"
+#include <libassert/assert.hpp>
+
+namespace FlexFlow {
+
+void sparse_categorical_crossentropy_loss_backward_kernel(
+    device_stream_t const &stream,
+    float *logit_grad_ptr,
+    float const *logit_ptr,
+    int const *label_ptr,
+    size_t logit_volume,
+    size_t logit_grad_volume,
+    int num_samples,
+    int num_classes,
+    int k,
+    float scale_factor) {
+  if (stream.is_gpu()) {
+    sparse_categorical_crossentropy_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*num_samples=*/num_samples,
+        /*num_classes=*/num_classes,
+        /*k=*/k,
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    sparse_categorical_crossentropy_loss_backward_cpu_kernel(
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*num_samples=*/num_samples,
+        /*num_classes=*/num_classes,
+        /*k=*/k,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+void categorical_crossentropy_loss_backward_kernel(
+    device_stream_t const &stream,
+    GenericTensorAccessorW const &logit_grad,
+    GenericTensorAccessorR const &logit,
+    GenericTensorAccessorR const &label,
+    float scale_factor) {
+  if (stream.is_gpu()) {
+    categorical_crossentropy_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*logit_grad_ptr=*/logit_grad.get_float_ptr(),
+        /*logit_ptr=*/logit.get_float_ptr(),
+        /*label_ptr=*/label.get_float_ptr(),
+        /*logit_volume=*/
+        get_num_elements(logit.shape.dims).int_from_positive_int(),
+        /*logit_grad_volume=*/
+        get_num_elements(logit_grad.shape.dims).int_from_positive_int(),
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    categorical_crossentropy_loss_backward_cpu_kernel(
+        /*logit_grad=*/logit_grad,
+        /*logit=*/logit,
+        /*label=*/label,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+void mean_squared_error_avg_loss_backward_kernel(device_stream_t const &stream,
+                                                 float *logit_grad_ptr,
+                                                 float const *logit_ptr,
+                                                 float const *label_ptr,
+                                                 size_t logit_volume,
+                                                 size_t logit_grad_volume,
+                                                 float scale_factor) {
+  if (stream.is_gpu()) {
+    mean_squared_error_avg_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    mean_squared_error_avg_loss_backward_cpu_kernel(
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+void identity_loss_backward_kernel(device_stream_t const &stream,
+                                   float *loss_grad_ptr,
+                                   float const *loss_ptr,
+                                   size_t loss_volume,
+                                   size_t loss_grad_volume,
+                                   float scale_factor) {
+  if (stream.is_gpu()) {
+    identity_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*loss_grad_ptr=*/loss_grad_ptr,
+        /*loss_ptr=*/loss_ptr,
+        /*loss_volume=*/loss_volume,
+        /*loss_grad_volume=*/loss_grad_volume,
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    identity_loss_backward_cpu_kernel(
+        /*loss_grad_ptr=*/loss_grad_ptr,
+        /*loss_ptr=*/loss_ptr,
+        /*loss_volume=*/loss_volume,
+        /*loss_grad_volume=*/loss_grad_volume,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/loss_function_kernels_cpu.cc b/lib/kernels/src/kernels/loss_function_kernels_cpu.cc
new file mode 100644
index 0000000000..b0d5f95558
--- /dev/null
+++ b/lib/kernels/src/kernels/loss_function_kernels_cpu.cc
@@ -0,0 +1,51 @@
+#include "kernels/loss_function_kernels_cpu.h"
+#include "kernels/tensor_accessor_binary_ops.h"
+#include "kernels/tensor_accessor_unary_ops.h"
+#include "op-attrs/datatype_value.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+void sparse_categorical_crossentropy_loss_backward_cpu_kernel(
+    float *logit_grad_ptr,
+    float const *logit_ptr,
+    int const *label_ptr,
+    size_t logit_volume,
+    size_t logit_grad_volume,
+    int num_samples,
+    int num_classes,
+    int k,
+    float scale_factor) {
+  NOT_IMPLEMENTED();
+}
+
+void categorical_crossentropy_loss_backward_cpu_kernel(
+    GenericTensorAccessorW const &logit_grad,
+    GenericTensorAccessorR const &logit,
+    GenericTensorAccessorR const &label,
+    float scale_factor) {
+  tensor_accessor_elementwise_subtract_to(
+      /*lhs=*/logit,
+      /*rhs=*/label,
+      /*output=*/logit_grad);
+  tensor_accessor_scale_by_constant_inplace(logit_grad, scale_factor);
+}
+
+void mean_squared_error_avg_loss_backward_cpu_kernel(float *logit_grad_ptr,
+                                                     float const *logit_ptr,
+                                                     float const *label_ptr,
+                                                     size_t logit_volume,
+                                                     size_t logit_grad_volume,
+                                                     float scale_factor) {
+  NOT_IMPLEMENTED();
+}
+
+void identity_loss_backward_cpu_kernel(float *loss_grad_ptr,
+                                       float const *loss_ptr,
+                                       size_t loss_volume,
+                                       size_t loss_grad_volume,
+                                       float scale_factor) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow
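categorical_crossentropy_loss_backward_cpu_kernel reduces to an elementwise subtract followed by a scale, i.e. the usual softmax cross-entropy gradient g = scale * (p - y), which is valid when logit holds post-softmax probabilities and label is a one-hot (or soft) distribution, as the subtract-then-scale structure suggests. A self-contained scalar version of the same computation:

#include <vector>

// Gradient of softmax + categorical cross-entropy w.r.t. the logits,
// mirroring the subtract-then-scale CPU kernel above.
std::vector<float> ce_backward(std::vector<float> const &softmax_out,
                               std::vector<float> const &one_hot_label,
                               float scale) {
  std::vector<float> grad(softmax_out.size());
  for (size_t i = 0; i < grad.size(); i++) {
    grad[i] = scale * (softmax_out[i] - one_hot_label[i]);
  }
  return grad;
}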
diff --git a/lib/kernels/src/kernels/optimizer_kernels.cc b/lib/kernels/src/kernels/optimizer_kernels.cc
new file mode 100644
index 0000000000..3d92d9ddc9
--- /dev/null
+++ b/lib/kernels/src/kernels/optimizer_kernels.cc
@@ -0,0 +1,98 @@
+#include "kernels/optimizer_kernels.h"
+#include "kernels/optimizer_kernels_cpu.h"
+#include "kernels/optimizer_kernels_gpu.h"
+#include <libassert/assert.hpp>
+
+namespace FlexFlow {
+
+void sgd_update_task(device_stream_t const &stream,
+                     device_handle_t const &handle,
+                     float lr,
+                     float momentum,
+                     bool nesterov,
+                     float weight_decay,
+                     GenericTensorAccessorR const &weight_grad,
+                     int num_replicas,
+                     GenericTensorAccessorW const &weight,
+                     std::optional<GenericTensorAccessorW> const &sgd_v) {
+  ASSERT(sgd_v.has_value() == (momentum > 0.0f));
+
+  if (stream.is_gpu()) {
+    float *sgd_v_ptr = nullptr;
+    if (momentum > 0.0f) {
+      sgd_v_ptr = sgd_v.value().get_float_ptr();
+    }
+
+    gpu_sgd_nccl_update_task(
+        /*stream=*/stream.require_gpu(),
+        /*lr=*/lr,
+        /*momentum=*/momentum,
+        /*nesterov=*/nesterov,
+        /*weight_decay=*/weight_decay,
+        /*handle=*/handle.require_for_gpu(),
+        /*weight_grad_ptr=*/weight_grad.get_float_ptr(),
+        /*size=*/
+        get_num_elements(weight_grad.shape.dims).int_from_positive_int(),
+        /*weight_ptr=*/weight.get_float_ptr(),
+        /*sgd_v_ptr=*/sgd_v_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(handle.is_for_cpu());
+    cpu_sgd_update_task(
+        /*lr=*/lr,
+        /*momentum=*/momentum,
+        /*nesterov=*/nesterov,
+        /*weight_decay=*/weight_decay,
+        /*weight_grad=*/weight_grad,
+        /*weight=*/weight,
+        /*sgd_v=*/sgd_v);
+  }
+}
+
+void adam_update_task(device_stream_t const &stream,
+                      device_handle_t const &handle,
+                      float alpha_t,
+                      float beta1,
+                      float beta2,
+                      float weight_decay,
+                      float epsilon,
+                      float const *weight_grad_ptr,
+                      size_t size,
+                      int num_replicas,
+                      float *weight_ptr,
+                      float *adam_v_ptr,
+                      float *adam_m_ptr) {
+  if (stream.is_gpu()) {
+    gpu_adam_nccl_update_task(
+        /*stream=*/stream.require_gpu(),
+        /*alpha_t=*/alpha_t,
+        /*beta1=*/beta1,
+        /*beta2=*/beta2,
+        /*weight_decay=*/weight_decay,
+        /*epsilon=*/epsilon,
+        /*handle=*/handle.require_for_gpu(),
+        /*weight_grad_ptr=*/weight_grad_ptr,
+        /*size=*/size,
+        /*weight_ptr=*/weight_ptr,
+        /*adam_v_ptr=*/adam_v_ptr,
+        /*adam_m_ptr=*/adam_m_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(handle.is_for_cpu());
+    cpu_adam_update_task(
+        /*alpha_t=*/alpha_t,
+        /*beta1=*/beta1,
+        /*beta2=*/beta2,
+        /*weight_decay=*/weight_decay,
+        /*epsilon=*/epsilon,
+        /*weight_grad_ptr=*/weight_grad_ptr,
+        /*size=*/size,
+        /*num_replicas=*/num_replicas,
+        /*weight_ptr=*/weight_ptr,
+        /*adam_v_ptr=*/adam_v_ptr,
+        /*adam_m_ptr=*/adam_m_ptr);
+  }
+}
+
+} // namespace FlexFlow
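(The GPU branch of adam_update_task originally asserted stream.is_cpu(), which would always fail there; the stray assertion is dropped above since handle.require_for_gpu() already enforces the GPU precondition.) The CPU SGD implementation that follows strings together accessor ops; scalarized, the update it performs is the classic SGD-with-momentum step. This sketch mirrors that code one-to-one but is not the FlexFlow API:

#include <cstddef>
#include <vector>

// gt = grad + weight_decay * w;  v = momentum * v + gt;
// gt = gt + momentum * v (Nesterov) or gt = v;  w -= lr * gt.
void sgd_step(std::vector<float> &w, std::vector<float> const &grad,
              std::vector<float> &v, float lr, float momentum,
              bool nesterov, float weight_decay) {
  for (size_t i = 0; i < w.size(); i++) {
    float gt = grad[i] + weight_decay * w[i];
    if (momentum > 0.0f) {
      v[i] = momentum * v[i] + gt;
      gt = nesterov ? (gt + momentum * v[i]) : v[i];
    }
    w[i] -= lr * gt;
  }
}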
diff --git a/lib/kernels/src/kernels/optimizer_kernels_cpu.cc b/lib/kernels/src/kernels/optimizer_kernels_cpu.cc
new file mode 100644
index 0000000000..7842215972
--- /dev/null
+++ b/lib/kernels/src/kernels/optimizer_kernels_cpu.cc
@@ -0,0 +1,76 @@
+#include "kernels/optimizer_kernels_cpu.h"
+#include "kernels/format_accessor_contents.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/tensor_accessor_binary_ops.h"
+#include "kernels/tensor_accessor_unary_ops.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+void cpu_sgd_update_task(float lr,
+                         float momentum,
+                         bool nesterov,
+                         float weight_decay,
+                         GenericTensorAccessorR const &weight_grad,
+                         GenericTensorAccessorW const &weight,
+                         std::optional<GenericTensorAccessorW> const &sgd_v) {
+  // based on sgd_update in lib/kernels/src/cuda/optimizer_kernels.cu
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+  std::cerr << "weight_grad=" << format_accessor_r_contents(weight_grad)
+            << std::endl
+            << "weight=" << format_accessor_w_contents(weight) << std::endl;
+
+  GenericTensorAccessorW gt = tensor_accessor_elementwise_add(
+      weight_grad,
+      read_only_accessor_from_write_accessor(tensor_accessor_scale_by_constant(
+          read_only_accessor_from_write_accessor(weight),
+          weight_decay,
+          cpu_allocator)),
+      cpu_allocator);
+
+  if (momentum > 0.0f) {
+    tensor_accessor_scale_by_constant_inplace(sgd_v.value(), momentum);
+    tensor_accessor_elementwise_add_to(
+        read_only_accessor_from_write_accessor(sgd_v.value()),
+        read_only_accessor_from_write_accessor(gt),
+        sgd_v.value());
+
+    if (nesterov) {
+      tensor_accessor_elementwise_add_to(
+          read_only_accessor_from_write_accessor(gt),
+          read_only_accessor_from_write_accessor(
+              tensor_accessor_scale_by_constant(
+                  read_only_accessor_from_write_accessor(sgd_v.value()),
+                  momentum,
+                  cpu_allocator)),
+          gt);
+    } else {
+      copy_accessor_data_to_l_from_r(
+          gt, read_only_accessor_from_write_accessor(sgd_v.value()));
+    }
+  }
+
+  tensor_accessor_elementwise_subtract_to(
+      read_only_accessor_from_write_accessor(weight),
+      read_only_accessor_from_write_accessor(tensor_accessor_scale_by_constant(
+          read_only_accessor_from_write_accessor(gt), lr, cpu_allocator)),
+      weight);
+}
+
+void cpu_adam_update_task(float alpha_t,
+                          float beta1,
+                          float beta2,
+                          float weight_decay,
+                          float epsilon,
+                          float const *weight_grad_ptr,
+                          size_t size,
+                          int num_replicas,
+                          float *weight_ptr,
+                          float *adam_v_ptr,
+                          float *adam_m_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow
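cpu_adam_update_task is still NOT_IMPLEMENTED. For reference, the standard Adam step its GPU counterpart is expected to compute looks like the following, assuming alpha_t is the bias-corrected learning rate precomputed by the caller (an assumption, since only the signature is visible here):

#include <cmath>
#include <cstddef>

// Standard Adam with decoupled-from-signature weight decay folded into the
// gradient, matching the parameter list of cpu_adam_update_task above.
void adam_step(float *w, float const *g, float *m, float *v, size_t n,
               float alpha_t, float beta1, float beta2, float weight_decay,
               float epsilon) {
  for (size_t i = 0; i < n; i++) {
    float grad = g[i] + weight_decay * w[i];
    m[i] = beta1 * m[i] + (1.0f - beta1) * grad;          // first moment
    v[i] = beta2 * v[i] + (1.0f - beta2) * grad * grad;   // second moment
    w[i] -= alpha_t * m[i] / (std::sqrt(v[i]) + epsilon);
  }
}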
diff --git a/lib/kernels/src/kernels/pool_2d_kernels.cc b/lib/kernels/src/kernels/pool_2d_kernels.cc
new file mode 100644
index 0000000000..6ebfc68c86
--- /dev/null
+++ b/lib/kernels/src/kernels/pool_2d_kernels.cc
@@ -0,0 +1,105 @@
+#include "kernels/pool_2d_kernels.h"
+#include "kernels/pool_2d_kernels_cpu.h"
+#include "kernels/pool_2d_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Pool2D {
+
+std::optional<Pool2DPerDeviceState>
+    init_kernel(DeviceType device_type,
+                device_handle_t const &handle,
+                std::optional<Activation> activation,
+                int input_w,
+                int input_h,
+                int input_c,
+                int input_n,
+                int output_w,
+                int output_h,
+                int output_c,
+                int output_n,
+                int pad_h,
+                int pad_w,
+                int kernel_h,
+                int kernel_w,
+                int stride_h,
+                int stride_w,
+                PoolOp pool_type) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*handle=*/handle.require_for_gpu(),
+        /*activation=*/activation,
+        /*input_w=*/input_w,
+        /*input_h=*/input_h,
+        /*input_c=*/input_c,
+        /*input_n=*/input_n,
+        /*output_w=*/output_w,
+        /*output_h=*/output_h,
+        /*output_c=*/output_c,
+        /*output_n=*/output_n,
+        /*pad_h=*/pad_h,
+        /*pad_w=*/pad_w,
+        /*kernel_h=*/kernel_h,
+        /*kernel_w=*/kernel_w,
+        /*stride_h=*/stride_h,
+        /*stride_w=*/stride_w,
+        /*pool_type=*/pool_type);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(device_stream_t const &stream,
+                    std::optional<Pool2DPerDeviceState> const &per_device_state,
+                    void const *input_ptr,
+                    void *output_ptr) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*input_ptr=*/input_ptr,
+        /*output_ptr=*/output_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input_ptr=*/input_ptr,
+        /*output_ptr=*/output_ptr);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<Pool2DPerDeviceState> const &per_device_state,
+    void const *output_ptr,
+    void const *output_grad_ptr,
+    void const *input_ptr,
+    void *input_grad_ptr) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_ptr=*/output_ptr,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_ptr=*/input_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*output_ptr=*/output_ptr,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_ptr=*/input_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  }
+}
+
+void cleanup_kernel(DeviceType device_type,
+                    std::optional<Pool2DPerDeviceState> &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Pool2D
diff --git a/lib/kernels/src/kernels/pool_2d_kernels_cpu.cc b/lib/kernels/src/kernels/pool_2d_kernels_cpu.cc
new file mode 100644
index 0000000000..f2d2141e96
--- /dev/null
+++ b/lib/kernels/src/kernels/pool_2d_kernels_cpu.cc
@@ -0,0 +1,17 @@
+#include "kernels/pool_2d_kernels_cpu.h"
+#include "utils/exception.h"
+
+namespace FlexFlow::Kernels::Pool2D {
+
+void cpu_forward_kernel(void const *input_ptr, void *output_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(void const *output_ptr,
+                         void const *output_grad_ptr,
+                         void const *input_ptr,
+                         void *input_grad_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Pool2D
diff --git a/lib/kernels/src/kernels/reduce_kernels.cc b/lib/kernels/src/kernels/reduce_kernels.cc
new file mode 100644
index 0000000000..bd3d6a8cd1
--- /dev/null
+++ b/lib/kernels/src/kernels/reduce_kernels.cc
@@ -0,0 +1,62 @@
+#include "kernels/reduce_kernels.h"
+#include "kernels/reduce_kernels_cpu.h"
+#include "kernels/reduce_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Reduce {
+
+std::optional<ReducePerDeviceState>
+    init_kernel(DeviceType device_type,
+                device_handle_t const &handle,
+                OperatorType const &operator_type,
+                size_t const &reduction_size,
+                TensorShape const &input_shape,
+                TensorShape const &output_shape) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(/*handle=*/handle.require_for_gpu(),
+                           /*operator_type=*/operator_type,
+                           /*reduction_size=*/reduction_size,
+                           /*input_shape=*/input_shape,
+                           /*output_shape=*/output_shape);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(device_stream_t const &stream,
+                    std::optional<ReducePerDeviceState> const &per_device_state,
+                    float const *input_ptr,
+                    float *output_ptr) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(/*stream=*/stream.require_gpu(),
+                       /*per_device_state=*/per_device_state.value(),
+                       /*input_ptr=*/input_ptr,
+                       /*output_ptr=*/output_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_forward_kernel(/*input_ptr=*/input_ptr,
+                       /*output_ptr=*/output_ptr);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<ReducePerDeviceState> const &per_device_state,
+    float const *output_grad_ptr,
+    float *input_grad_ptr) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(/*stream=*/stream.require_gpu(),
+                        /*per_device_state=*/per_device_state.value(),
+                        /*output_grad_ptr=*/output_grad_ptr,
+                        /*input_grad_ptr=*/input_grad_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_backward_kernel(/*output_grad_ptr=*/output_grad_ptr,
+                        /*input_grad_ptr=*/input_grad_ptr);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Reduce
diff --git a/lib/kernels/src/kernels/reduce_kernels_cpu.cc b/lib/kernels/src/kernels/reduce_kernels_cpu.cc
new file mode 100644
index 0000000000..295e126b49
--- /dev/null
+++ b/lib/kernels/src/kernels/reduce_kernels_cpu.cc
@@ -0,0 +1,14 @@
+#include "kernels/reduce_kernels_cpu.h"
+#include "utils/exception.h"
+
+namespace FlexFlow::Kernels::Reduce {
+
+void cpu_forward_kernel(float const *input_ptr, float *output_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Reduce
diff --git a/lib/kernels/src/kernels/reshape_kernels.cc b/lib/kernels/src/kernels/reshape_kernels.cc
new file mode 100644
index 0000000000..2ac90352bb
--- /dev/null
+++ b/lib/kernels/src/kernels/reshape_kernels.cc
@@ -0,0 +1,39 @@
+#include "kernels/reshape_kernels.h"
+#include "kernels/reshape_kernels_cpu.h"
+#include "kernels/reshape_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Reshape {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input,
+                    GenericTensorAccessorW const &output) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output=*/output);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output=*/output);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &output,
+                     GenericTensorAccessorW const &input) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*output=*/output,
+        /*input=*/input);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*output=*/output,
+        /*input=*/input);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Reshape
diff --git a/lib/kernels/src/kernels/reshape_kernels_cpu.cc b/lib/kernels/src/kernels/reshape_kernels_cpu.cc
new file mode 100644
index 0000000000..b48272cdde
--- /dev/null
+++ b/lib/kernels/src/kernels/reshape_kernels_cpu.cc
@@ -0,0 +1,15 @@
+#include "kernels/reshape_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Reshape {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output,
+                         GenericTensorAccessorW const &input) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Reshape
diff --git a/lib/kernels/src/kernels/reverse_kernels.cc b/lib/kernels/src/kernels/reverse_kernels.cc
new file mode 100644
index 0000000000..301c60ea3d
--- /dev/null
+++ b/lib/kernels/src/kernels/reverse_kernels.cc
@@ -0,0 +1,33 @@
+#include "kernels/reverse_kernels.h"
+#include "kernels/reverse_kernels_cpu.h"
+#include "kernels/reverse_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Reverse {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input_accessor,
+                    GenericTensorAccessorW &output_accessor,
+                    ReverseAttrs const &attrs) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        stream.require_gpu(), input_accessor, output_accessor, attrs);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(input_accessor, output_accessor, attrs);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &output_accessor,
+                     GenericTensorAccessorW &input_accessor,
+                     ReverseAttrs const &attrs) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        stream.require_gpu(), output_accessor, input_accessor, attrs);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(output_accessor, input_accessor, attrs);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Reverse
diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/kernels/reverse_kernels_cpu.cc
similarity index 64%
rename from lib/kernels/src/cpu/ops/reverse_kernels.cc
rename to lib/kernels/src/kernels/reverse_kernels_cpu.cc
index 212a52881a..e21c986dd8 100644
--- a/lib/kernels/src/cpu/ops/reverse_kernels.cc
+++ b/lib/kernels/src/kernels/reverse_kernels_cpu.cc
@@ -1,5 +1,5 @@
-#include "kernels/datatype_dispatch.h"
 #include "kernels/reverse_kernels_cpu.h"
+#include "kernels/datatype_dispatch.h"
 #include
@@ -9,19 +9,19 @@ struct CPUReverseForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
GenericTensorAccessorW &output, ReverseAttrs const &attrs) { - positive_int reverse_axis_size = input.shape.at(attrs.axis); + positive_int reverse_axis_size = dim_at_idx(input.shape.dims, attrs.axis); - for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + for (TensorDimsCoord const &input_coord : + get_tensor_dims_coord_set(input.shape.dims)) { nonnegative_int input_reverse_axis_coord = input_coord.ff_ordered.at(attrs.axis); - ArrayCoord output_coord = input_coord; + TensorDimsCoord output_coord = input_coord; output_coord.ff_ordered.at(attrs.axis) = nonnegative_int{reverse_axis_size.int_from_positive_int() - input_reverse_axis_coord.unwrap_nonnegative() - 1}; - output.at
<DT>(output_coord.ff_ordered) = - input.at<DT>(input_coord.ff_ordered); + output.at<DT>(output_coord) = input.at<DT>
(input_coord); } } }; @@ -31,16 +31,17 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, ReverseAttrs const &attrs) { DataTypeDispatch1{}( - input_accessor.data_type, input_accessor, output_accessor, attrs); + input_accessor.shape.data_type, input_accessor, output_accessor, attrs); } void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, GenericTensorAccessorW &input_grad_accessor, ReverseAttrs const &attrs) { - DataTypeDispatch1{}(output_grad_accessor.data_type, - output_grad_accessor, - input_grad_accessor, - attrs); + DataTypeDispatch1{}( + output_grad_accessor.shape.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc index 0ad1a5ed20..cf72fb3eef 100644 --- a/lib/kernels/src/kernels/reverse_kernels_params.cc +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -1,29 +1,31 @@ #include "kernels/reverse_kernels_params.h" +#include "op-attrs/tensor_dims.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { ReverseKernelsParams - compute_reverse_kernels_params(ArrayShape const &output_shape, + compute_reverse_kernels_params(TensorDims const &output_dims, ReverseAttrs const &attrs) { auto axis = attrs.axis; positive_int in_blk_size = 1_p; positive_int reverse_dim_size = 1_p; positive_int num_out_blks = 1_p; - for (nonnegative_int i : nonnegative_range(output_shape.num_dims())) { + for (nonnegative_int i : nonnegative_range(get_num_dims(output_dims))) { if (i < axis.value) { - in_blk_size *= output_shape.at(ff_dim_t{i}); + in_blk_size *= dim_at_idx(output_dims, ff_dim_t{i}); } else if (i == axis.value) { - reverse_dim_size = output_shape.at(ff_dim_t{i}); + reverse_dim_size = dim_at_idx(output_dims, ff_dim_t{i}); } else { - num_out_blks *= output_shape.at(ff_dim_t{i}); + num_out_blks *= dim_at_idx(output_dims, ff_dim_t{i}); } } return ReverseKernelsParams{ - num_out_blks, - reverse_dim_size, - in_blk_size, - output_shape.num_elements(), + /*num_out_blks=*/num_out_blks, + /*reverse_dim_size=*/reverse_dim_size, + /*in_blk_size=*/in_blk_size, + /*out_size=*/get_num_elements(output_dims), }; } diff --git a/lib/kernels/src/kernels/softmax_kernels.cc b/lib/kernels/src/kernels/softmax_kernels.cc new file mode 100644 index 0000000000..3cc655dc7c --- /dev/null +++ b/lib/kernels/src/kernels/softmax_kernels.cc @@ -0,0 +1,79 @@ +#include "kernels/softmax_kernels.h" +#include "kernels/softmax_kernels_cpu.h" +#include "kernels/softmax_kernels_gpu.h" +#include + +namespace FlexFlow::Kernels::Softmax { + +std::optional init_kernel(DeviceType device_type, + device_handle_t const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*dim=*/dim, + /*input_n=*/input_n, + /*input_c=*/input_c, + /*input_h=*/input_h, + /*input_w=*/input_w); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } else { + ASSERT(stream.is_cpu()); + 
ASSERT(per_device_state == std::nullopt); + cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } +} + +void backward_kernel(device_stream_t const &stream, + float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*num_elements=*/num_elements); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*num_elements=*/num_elements); + } +} + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::Softmax diff --git a/lib/kernels/src/kernels/softmax_kernels_cpu.cc b/lib/kernels/src/kernels/softmax_kernels_cpu.cc new file mode 100644 index 0000000000..20f9b68299 --- /dev/null +++ b/lib/kernels/src/kernels/softmax_kernels_cpu.cc @@ -0,0 +1,16 @@ +#include "kernels/softmax_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Softmax { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Softmax diff --git a/lib/kernels/src/kernels/split_kernels.cc b/lib/kernels/src/kernels/split_kernels.cc new file mode 100644 index 0000000000..f38ae2e8af --- /dev/null +++ b/lib/kernels/src/kernels/split_kernels.cc @@ -0,0 +1,63 @@ +#include "kernels/split_kernels.h" +#include "kernels/split_kernels_cpu.h" +#include "kernels/split_kernels_gpu.h" +#include + +namespace FlexFlow::Kernels::Split { + +void forward_kernel(device_stream_t const &stream, + float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*out_ptrs=*/out_ptrs, + /*in_ptr=*/in_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } else { + cpu_forward_kernel( + /*out_ptrs=*/out_ptrs, + /*in_ptr=*/in_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } +} + +void backward_kernel(device_stream_t const &stream, + float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*in_grad_ptr=*/in_grad_ptr, + /*out_grad_ptr=*/out_grad_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*in_grad_ptr=*/in_grad_ptr, + /*out_grad_ptr=*/out_grad_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } +} + +} // namespace FlexFlow::Kernels::Split diff --git a/lib/kernels/src/kernels/split_kernels_cpu.cc b/lib/kernels/src/kernels/split_kernels_cpu.cc new file mode 100644 index 0000000000..1639848ef4 --- /dev/null +++ 
b/lib/kernels/src/kernels/split_kernels_cpu.cc @@ -0,0 +1,24 @@ +#include "kernels/split_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Split { + +void cpu_forward_kernel(float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Split diff --git a/lib/kernels/src/kernels/tensor_accessor_binary_ops.cc b/lib/kernels/src/kernels/tensor_accessor_binary_ops.cc new file mode 100644 index 0000000000..db7830f926 --- /dev/null +++ b/lib/kernels/src/kernels/tensor_accessor_binary_ops.cc @@ -0,0 +1,143 @@ +#include "kernels/tensor_accessor_binary_ops.h" +#include "kernels/map_tensor_accessors.h" +#include "op-attrs/tensor_shape.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_elementwise_add(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l + r; }, + output_allocator); +} + +void tensor_accessor_elementwise_add_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + map_tensor_accessors2_to( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l + r; }, + output); +} + +GenericTensorAccessorW + tensor_accessor_elementwise_subtract(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l - r; }, + output_allocator); +} + +void tensor_accessor_elementwise_subtract_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + map_tensor_accessors2_to( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l - r; }, + output); +} + +GenericTensorAccessorW + tensor_accessor_elementwise_multiply(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l * r; }, + output_allocator); +} + +void tensor_accessor_elementwise_multiply_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + map_tensor_accessors2_to( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l * r; }, + output); +} + +static TensorShape get_matmul_output_shape(TensorShape const &lhs, + TensorShape const &rhs) { + ASSERT(get_num_dims(lhs.dims) == 2); + ASSERT(get_num_dims(rhs.dims) == 2); + ASSERT(lhs.data_type == DataType::FLOAT); + ASSERT(rhs.data_type == DataType::FLOAT); + ASSERT(dim_at_idx(lhs.dims, relative_ff_dim_t{1}) == + dim_at_idx(rhs.dims, relative_ff_dim_t{0})); + + return TensorShape{ + TensorDims{FFOrdered{ + dim_at_idx(lhs.dims, relative_ff_dim_t{0}), + dim_at_idx(rhs.dims, relative_ff_dim_t{1}), + }}, + 
DataType::FLOAT, + }; +} + +GenericTensorAccessorW tensor_accessor_matmul(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + TensorShape output_shape = + get_matmul_output_shape(get_tensor_shape_for_accessor_r(lhs), + get_tensor_shape_for_accessor_r(rhs)); + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_matmul_to(lhs, rhs, output); + + return output; +} + +void tensor_accessor_matmul_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + TensorShape output_shape = + get_matmul_output_shape(get_tensor_shape_for_accessor_r(lhs), + get_tensor_shape_for_accessor_r(rhs)); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR lhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); + GenericTensorAccessorR rhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + for (nonnegative_int i : + nonnegative_range(dim_at_idx(lhs.shape.dims, ff_dim_t{0_n}))) { + for (nonnegative_int j : + nonnegative_range(dim_at_idx(rhs.shape.dims, ff_dim_t{1_n}))) { + float accum = 0.0f; + for (nonnegative_int k : + nonnegative_range(dim_at_idx(lhs.shape.dims, ff_dim_t{1_n}))) { + accum += lhs_cpu.at(TensorDimsCoord{FFOrdered{i, k}}) * + rhs_cpu.at(TensorDimsCoord{FFOrdered{k, j}}); + } + output_cpu.at(TensorDimsCoord{FFOrdered{i, j}}) = accum; + } + } + + return copy_accessor_data_to_l_from_r(output, output_cpu); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/src/kernels/tensor_accessor_reductions.cc index b11791d32c..199de51ff7 100644 --- a/lib/kernels/src/kernels/tensor_accessor_reductions.cc +++ b/lib/kernels/src/kernels/tensor_accessor_reductions.cc @@ -5,7 +5,7 @@ namespace FlexFlow { bool tensor_accessor_all(GenericTensorAccessorR const &t) { - ASSERT(t.data_type == DataType::BOOL); + ASSERT(t.shape.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( t, @@ -16,7 +16,7 @@ bool tensor_accessor_all(GenericTensorAccessorR const &t) { } bool tensor_accessor_any(GenericTensorAccessorR const &t) { - ASSERT(t.data_type == DataType::BOOL); + ASSERT(t.shape.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( t, diff --git a/lib/kernels/src/kernels/tensor_accessor_unary_ops.cc b/lib/kernels/src/kernels/tensor_accessor_unary_ops.cc new file mode 100644 index 0000000000..0a17e19f80 --- /dev/null +++ b/lib/kernels/src/kernels/tensor_accessor_unary_ops.cc @@ -0,0 +1,247 @@ +#include "kernels/tensor_accessor_unary_ops.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/fill_tensor_accessor.h" +#include "kernels/map_tensor_accessors.h" +#include "op-attrs/datatype_value.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/reversed.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/tensor_dims.h" +#include "op-attrs/tensor_dims_coord.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_scale_by_constant(GenericTensorAccessorR const &t, + float constant, + Allocator &output_allocator) { + ASSERT(t.shape.data_type == DataType::FLOAT); + + return map_tensor_accessor( + t, [&](auto const &elem) { return elem * constant; }, output_allocator); +} + +void tensor_accessor_scale_by_constant_inplace(GenericTensorAccessorW const &t, + 
float constant) { + ASSERT(t.shape.data_type == DataType::FLOAT); + + return map_tensor_accessor_inplace( + t, [&](auto const &elem) { return elem * constant; }); +} + +template +static T single_element_relu(T elem) { + if (elem >= 0) { + return elem; + } else { + return 0; + } +} + +void tensor_accessor_relu_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + map_tensor_accessor_to( + input, [](auto elem) { return single_element_relu(elem); }, output); +} + +GenericTensorAccessorW tensor_accessor_relu(GenericTensorAccessorR const &input, + Allocator &output_allocator) { + return map_tensor_accessor( + input, + [](auto elem) { return single_element_relu(elem); }, + output_allocator); +} + +template +struct CPUTensorAccessorBroadcast { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + + for (TensorDimsCoord const &output_coord : + get_tensor_dims_coord_set(output.shape.dims)) { + TensorDimsCoord input_coord = get_broadcast_src_coord( + /*input_dims=*/input.shape.dims, + /*output_dims=*/output.shape.dims, + /*dst_coord=*/output_coord); + + output.at
<DT>(output_coord) = input.at<DT>
(input_coord); + } + } +}; + +void tensor_accessor_broadcast_to(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + GenericTensorAccessorW const &output) { + ASSERT(tensor_dims_is_broadcastable_to(input.shape.dims, output_dims)); + + TensorShape output_shape = TensorShape{output_dims, input.shape.data_type}; + ASSERT(get_tensor_shape_for_accessor_w(output) == output_shape); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + DataTypeDispatch1{}( + input.shape.data_type, input_cpu, output_cpu); + + copy_accessor_data_to_l_from_r(output, output_cpu); +} + +GenericTensorAccessorW + tensor_accessor_broadcast(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + Allocator &output_allocator) { + + TensorShape output_shape = TensorShape{output_dims, input.shape.data_type}; + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_broadcast_to(input, output_dims, output); + + return output; +} + +template +struct CPUTensorAccessorTranspose { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + ASSERT(get_num_dims(input.shape.dims) == 2); + ASSERT(get_num_dims(output.shape.dims) == 2); + + for (TensorDimsCoord const &input_coord : + get_tensor_dims_coord_set(input.shape.dims)) { + ASSERT(input_coord.ff_ordered.size() == 2); + + TensorDimsCoord output_coord = TensorDimsCoord{ + reversed(input_coord.ff_ordered), + }; + + output.at
<DT>(output_coord) = input.at<DT>
(input_coord); + } + } +}; + +static TensorShape get_transpose_output_shape(TensorShape const &input_shape) { + return TensorShape{ + TensorDims{ + reversed(input_shape.dims.ff_ordered), + }, + input_shape.data_type, + }; +} + +void tensor_accessor_transpose_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + ASSERT(get_num_dims(input.shape.dims) == 2); + + TensorShape output_shape = + get_transpose_output_shape(get_tensor_shape_for_accessor_r(input)); + ASSERT(get_tensor_shape_for_accessor_w(output) == output_shape); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + DataTypeDispatch1{}( + input.shape.data_type, input_cpu, output_cpu); + + copy_accessor_data_to_l_from_r(output, output_cpu); +} + +GenericTensorAccessorW + tensor_accessor_transpose(GenericTensorAccessorR const &input, + Allocator &output_allocator) { + + TensorShape output_shape = + get_transpose_output_shape(get_tensor_shape_for_accessor_r(input)); + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_transpose_to(input, output); + + return output; +} + +template +struct CPUTensorAccessorReduce { + void operator()(GenericTensorAccessorR const &input, + ff_dim_t reduction_dim, + GenericTensorAccessorW const &output) { + fill_with_zeros(output); + + for (TensorDimsCoord const &input_coord : + get_tensor_dims_coord_set(input.shape.dims)) { + TensorDimsCoord output_coord = tensor_dims_coord_drop_dims( + input_coord, [&](ff_dim_t input_coord_dim) { + return input_coord_dim == reduction_dim; + }); + + output.at
<DT>(output_coord) += input.at<DT>
(input_coord); + } + } +}; + +static TensorShape get_reduce_output_shape(TensorShape const &input_shape, + ff_dim_t reduction_dim) { + ASSERT(tensor_dims_has_dim(input_shape.dims, reduction_dim), + input_shape.dims, + reduction_dim); + + return TensorShape{ + TensorDims{ + concat( + slice(input_shape.dims.ff_ordered, ff_dim_t{0_n}, reduction_dim), + slice(input_shape.dims.ff_ordered, + ff_dim_t{reduction_dim.value + 1_n}, + std::nullopt)), + }, + input_shape.data_type, + }; +} + +void tensor_accessor_reduce_to(GenericTensorAccessorR const &input, + ff_dim_t reduction_dim, + GenericTensorAccessorW const &output) { + + TensorShape output_shape = get_reduce_output_shape( + get_tensor_shape_for_accessor_r(input), reduction_dim); + ASSERT(get_tensor_shape_for_accessor_r(output) == output_shape); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + DataTypeDispatch1{}( + input.shape.data_type, input_cpu, reduction_dim, output_cpu); + + copy_accessor_data_to_l_from_r(output, output_cpu); +} + +GenericTensorAccessorW + tensor_accessor_reduce(GenericTensorAccessorR const &input, + ff_dim_t reduction_dim, + Allocator &output_allocator) { + + TensorShape output_shape = get_reduce_output_shape( + get_tensor_shape_for_accessor_r(input), reduction_dim); + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_reduce_to(input, reduction_dim, output); + + return output; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/topk_kernels.cc b/lib/kernels/src/kernels/topk_kernels.cc new file mode 100644 index 0000000000..a3a3c616b3 --- /dev/null +++ b/lib/kernels/src/kernels/topk_kernels.cc @@ -0,0 +1,67 @@ +#include "kernels/topk_kernels.h" +#include "kernels/topk_kernels_cpu.h" +#include "kernels/topk_kernels_gpu.h" +#include + +namespace FlexFlow::Kernels::TopK { + +void forward_kernel(device_stream_t const &stream, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*indices_ptr=*/indices_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k, + /*sorted=*/sorted); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*indices_ptr=*/indices_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k, + /*sorted=*/sorted); + } +} + +void backward_kernel(device_stream_t const &stream, + float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*out_grad_ptr=*/out_grad_ptr, + /*indices_ptr=*/indices_ptr, + /*in_grad_ptr=*/in_grad_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*out_grad_ptr=*/out_grad_ptr, + /*indices_ptr=*/indices_ptr, + /*in_grad_ptr=*/in_grad_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k); + } +} + +} // namespace FlexFlow::Kernels::TopK diff --git a/lib/kernels/src/kernels/topk_kernels_cpu.cc b/lib/kernels/src/kernels/topk_kernels_cpu.cc new file mode 100644 index 
0000000000..86ab45f773 --- /dev/null +++ b/lib/kernels/src/kernels/topk_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/topk_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::TopK { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::TopK diff --git a/lib/kernels/src/kernels/transpose_kernels.cc b/lib/kernels/src/kernels/transpose_kernels.cc new file mode 100644 index 0000000000..bb3775a073 --- /dev/null +++ b/lib/kernels/src/kernels/transpose_kernels.cc @@ -0,0 +1,45 @@ +#include "kernels/transpose_kernels.h" +#include "kernels/transpose_kernels_cpu.h" +#include "kernels/transpose_kernels_gpu.h" + +namespace FlexFlow::Kernels::Transpose { + +void forward_kernel(device_stream_t const &stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*attrs=*/attrs, + /*input=*/input, + /*output=*/output); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*attrs=*/attrs, + /*input=*/input, + /*output=*/output); + } +} + +void backward_kernel(device_stream_t const &stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*attrs=*/attrs, + /*out_grad=*/out_grad, + /*in_grad=*/in_grad); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*attrs=*/attrs, + /*out_grad=*/out_grad, + /*in_grad=*/in_grad); + } +} + +} // namespace FlexFlow::Kernels::Transpose diff --git a/lib/kernels/src/kernels/transpose_kernels_cpu.cc b/lib/kernels/src/kernels/transpose_kernels_cpu.cc new file mode 100644 index 0000000000..7950e71eb4 --- /dev/null +++ b/lib/kernels/src/kernels/transpose_kernels_cpu.cc @@ -0,0 +1,18 @@ +#include "kernels/transpose_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Transpose { + +void cpu_forward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Transpose diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index 305a6c935c..cccc46d6bf 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -51,6 +51,19 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } +std::optional<ManagedPerDeviceFFHandle> + create_local_handle_for_device_type(DeviceType device_type, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { + if (device_type == DeviceType::CPU) { + return std::nullopt; + } else { + return initialize_single_gpu_handle( + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion); + } +} + ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, bool allowTensorOpMathConversion) { diff --git 
a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc deleted file mode 100644 index 1984fd5f83..0000000000 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/create_accessor_with_contents.h" -#include "kernels/format_accessor_contents.h" -#include "kernels/replicate_kernels_cpu.h" -#include "test/utils/doctest/check_kv.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Replicate::cpu_forward_kernel") { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - GenericTensorAccessorR input = - create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); - - TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::INT32, - }; - GenericTensorAccessorW result = - create_zero_filled_accessor_w(result_shape, cpu_allocator); - - GenericTensorAccessorR correct = input; - - Kernels::Replicate::cpu_forward_kernel(input, result); - - CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); - } - - TEST_CASE("Replicate::cpu_backward_kernel") { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - GenericTensorAccessorR output = create_2d_accessor_r_with_contents( - { - {1, 2, 3}, - {4, 3, 3}, - {1, 3, 5}, - }, - cpu_allocator); - - GenericTensorAccessorR correct = - create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); - - TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::INT32, - }; - GenericTensorAccessorW result = - create_zero_filled_accessor_w(result_shape, cpu_allocator); - Kernels::Replicate::cpu_backward_kernel(output, result, 3); - - CHECK_MESSAGE(accessors_are_equal(result, correct), - check_kv("result", format_accessor_w_contents(result))); - } -} diff --git a/lib/kernels/src/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc similarity index 85% rename from lib/kernels/src/test_utils.cc rename to lib/kernels/test/src/internal/test_utils.cc index 67f2fb624a..b30c656f9a 100644 --- a/lib/kernels/src/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -1,4 +1,5 @@ -#include "kernels/test_utils.h" +#include "internal/test_utils.h" +#include "kernels/fill_tensor_accessor.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/require_all_same1.h" #include "utils/join_strings.h" @@ -32,7 +33,7 @@ struct CreateRandomFilledAccessorW { std::random_device rd; std::mt19937 gen(rd()); - size_t num_elements = get_num_elements(shape).int_from_positive_int(); + size_t num_elements = get_num_elements(shape.dims).int_from_positive_int(); if constexpr (std::is_same::value) { std::bernoulli_distribution dist(0.5); for (size_t i = 0; i < num_elements; i++) { @@ -71,28 +72,6 @@ GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, return read_only_accessor_from_write_accessor(accessor); } -template -struct FillWithZeros { - void operator()(GenericTensorAccessorW const &accessor) { - using T = real_type_t
; - - if (accessor.device_type == DeviceType::CPU) { - memset(accessor.ptr, - 0, - accessor.shape.num_elements().int_from_positive_int() * sizeof(T)); - } else { - checkCUDA(cudaMemset( - accessor.ptr, - 0, - accessor.shape.num_elements().int_from_positive_int() * sizeof(T))); - } - } -}; - -void fill_with_zeros(GenericTensorAccessorW const &accessor) { - DataTypeDispatch1{}(accessor.data_type, accessor); -} - template struct CPUAccessorRContainsNonZero { bool operator()(GenericTensorAccessorR const &accessor) { @@ -100,7 +79,7 @@ struct CPUAccessorRContainsNonZero { T const *data_ptr = accessor.get
<DT>(); - int volume = accessor.shape.num_elements().int_from_positive_int(); + int volume = get_num_elements(accessor.shape.dims).int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (data_ptr[i] != 0) { return true; @@ -116,7 +95,7 @@ bool contains_non_zero(GenericTensorAccessorR const &accessor) { GenericTensorAccessorR cpu_accessor = copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); return DataTypeDispatch1<CPUAccessorRContainsNonZero>{}( - cpu_accessor.data_type, cpu_accessor); + cpu_accessor.shape.data_type, cpu_accessor); } template <DataType DT> struct AccessorsAreEqual { bool operator()(GenericTensorAccessorR const &accessor_a, GenericTensorAccessorR const &accessor_b, Allocator &cpu_allocator) { GenericTensorAccessorR cpu_accessor_a = copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); GenericTensorAccessorR cpu_accessor_b = copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); T const *a_data_ptr = cpu_accessor_a.get<DT>
(); T const *b_data_ptr = cpu_accessor_b.get<DT>
(); - int volume = accessor_a.shape.num_elements().int_from_positive_int(); + int volume = + get_num_elements(accessor_a.shape.dims).int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (a_data_ptr[i] != b_data_ptr[i]) { return false; @@ -150,7 +130,7 @@ bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, "accessors_are_equal expects accessors to have the same shape"); return DataTypeDispatch1{}( - accessor_a.data_type, accessor_a, accessor_b); + accessor_a.shape.data_type, accessor_a, accessor_b); } template @@ -171,7 +151,8 @@ struct CreateFilledAccessorW { T *data_ptr = src_accessor.get
(); - int volume = dst_accessor.shape.num_elements().int_from_positive_int(); + int volume = + get_num_elements(dst_accessor.shape.dims).int_from_positive_int(); for (size_t i = 0; i < volume; i++) { data_ptr[i] = unwrapped_value; } diff --git a/lib/kernels/include/kernels/test_utils.h b/lib/kernels/test/src/internal/test_utils.h similarity index 96% rename from lib/kernels/include/kernels/test_utils.h rename to lib/kernels/test/src/internal/test_utils.h index 9147b667d6..3a2c9b773c 100644 --- a/lib/kernels/include/kernels/test_utils.h +++ b/lib/kernels/test/src/internal/test_utils.h @@ -31,8 +31,6 @@ GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, bool contains_non_zero(GenericTensorAccessorR const &accessor); -void fill_with_zeros(GenericTensorAccessorW const &accessor); - void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, std::ostream &stream); diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 31a6cba205..b5daf80011 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -1,5 +1,5 @@ #include "kernels/accessor.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include @@ -9,9 +9,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("calculate_accessor_offset") { SUBCASE("one dimension") { - std::vector indices = {4_n}; - ArrayShape shape = ArrayShape{ - std::vector{ + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{4_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 13_p, }, }; @@ -22,24 +22,43 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } + SUBCASE("2d tensor is row-major") { + positive_int num_rows = 5_p; + positive_int num_cols = 6_p; + + TensorDims shape = TensorDims{ + FFOrdered{ + num_rows, + num_cols, + }, + }; + + CHECK(calculate_accessor_offset(TensorDimsCoord{FFOrdered{0_n, 0_n}}, + shape) == 0_n); + CHECK(calculate_accessor_offset(TensorDimsCoord{FFOrdered{1_n, 0_n}}, + shape) == num_cols); + CHECK(calculate_accessor_offset(TensorDimsCoord{FFOrdered{0_n, 1_n}}, + shape) == 1_p); + } + SUBCASE("multiple dimensions") { - std::vector indices = {2_n, 4_n}; - ArrayShape shape = ArrayShape{ - std::vector{ - 6_p, + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{2_n, 4_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 5_p, + 6_p, }, }; nonnegative_int result = calculate_accessor_offset(indices, shape); - nonnegative_int correct = 2_n * 5_n + 4_n; + nonnegative_int correct = 2_n * 6_n + 4_n; CHECK(result == correct); } SUBCASE("zero dimensions") { - std::vector indices = {}; - ArrayShape shape = ArrayShape{std::vector{}}; + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{}}; + TensorDims shape = TensorDims{FFOrdered{}}; nonnegative_int result = calculate_accessor_offset(indices, shape); nonnegative_int correct = 0_n; @@ -48,11 +67,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("index and shape dimensions do not match") { - std::vector indices = {1_n, 2_n, 4_n}; - ArrayShape shape = ArrayShape{ - std::vector{ - 6_p, + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{1_n, 2_n, 4_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 5_p, + 6_p, }, }; @@ -60,11 +79,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("out of bounds index") { - std::vector indices = {2_n, 5_n}; - ArrayShape shape = ArrayShape{ - std::vector{ - 6_p, + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{2_n, 
6_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 5_p, + 6_p, }, }; diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc deleted file mode 100644 index b3ccbc688c..0000000000 --- a/lib/kernels/test/src/kernels/array_shape.cc +++ /dev/null @@ -1,87 +0,0 @@ -#include "kernels/array_shape.h" -#include "test/utils/doctest/fmt/unordered_set.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("get_array_coord_set") { - SUBCASE("ArrayShape is not empty") { - ArrayShape input = ArrayShape{ - LegionOrdered{2_p, 1_p, 3_p}, - }; - - std::unordered_set result = get_array_coord_set(input); - std::unordered_set correct = { - ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, - ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, - ArrayCoord{FFOrdered{1_n, 0_n, 0_n}}, - ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, - ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, - ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, - }; - - CHECK(result == correct); - } - - SUBCASE("ArrayShape is zero-dimensional") { - ArrayShape input = ArrayShape{LegionOrdered{}}; - - std::unordered_set result = get_array_coord_set(input); - std::unordered_set correct = { - ArrayCoord{FFOrdered{}}, - }; - - CHECK(result == correct); - } - } - - TEST_CASE("array_shape_drop_dims") { - ArrayShape input = ArrayShape{ - LegionOrdered{2_p, 4_p, 3_p}, - }; - - SUBCASE("removes dims specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return dim.value % 2_n == 0; - }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = ArrayShape{ - LegionOrdered{4_p}, - }; - - CHECK(result == correct); - } - - SUBCASE( - "is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = input; - - CHECK(result == correct); - } - - SUBCASE( - "is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = input; - - CHECK(result == correct); - } - - SUBCASE( - "returns empty shape if all dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { return true; }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = ArrayShape{LegionOrdered{}}; - - CHECK(result == correct); - } - } -} diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index 4e85dfdaa0..85ffa91315 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -1,5 +1,5 @@ #include "kernels/compare_tensor_accessors.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc index 69fa2728bf..3f073f2697 100644 --- a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc +++ b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); auto at = 
[&](nonnegative_int c) -> float { - return result.at(FFOrdered{c}); + return result.at(TensorDimsCoord{FFOrdered{c}}); }; CHECK(at(0_n) == 1); @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 1); @@ -62,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { auto at = [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{s, r, c}); + return result.at(TensorDimsCoord{FFOrdered{s, r, c}}); }; CHECK(at(0_n, 0_n, 0_n) == 1); @@ -111,7 +111,8 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int s2, nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{s1, s2, r, c}); + return result.at( + TensorDimsCoord{FFOrdered{s1, s2, r, c}}); }; CHECK(at(0_n, 0_n, 0_n, 0_n) == 2); diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc index a2b61b8dff..b4af5c9148 100644 --- a/lib/kernels/test/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -1,5 +1,5 @@ #include "kernels/format_accessor_contents.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include @@ -88,11 +88,67 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("accessor is some other dimension") { + SUBCASE("accessor is 4d") { GenericTensorAccessorR accessor = - create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + create_4d_accessor_r_with_contents( + { + { + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + { + {9, 2, 7, 6}, + {7, 2, 1, 1}, + {2, 8, 5, 6}, + }, + }, + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + }, + }, + cpu_allocator); - CHECK_THROWS(format_accessor_r_contents(accessor)); + std::string correct = "[\n" + " [\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + " [\n" + " [9 2 7 6]\n" + " [7 2 1 1]\n" + " [2 8 5 6]\n" + " ]\n" + " ]\n" + " [\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); } } } diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc index 34822ed1c3..23401ffebe 100644 --- a/lib/kernels/test/src/kernels/legion_dim.cc +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -7,7 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("key_range(LegionOrdered)") { SUBCASE("input is non-empty") { - LegionOrdered input = {5, 3, 2, 3}; + LegionOrdered input = LegionOrdered{5, 3, 2, 3}; std::set result = key_range(input); std::set correct = { diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc index 759507264f..e1846bd01a 100644 --- a/lib/kernels/test/src/kernels/legion_ordered/transform.cc +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("input is not empty") { - LegionOrdered input = {2, 1, 2, 5}; + LegionOrdered input = LegionOrdered{2, 1, 2, 5}; LegionOrdered result = transform(input, [](int x) { return 
fmt::to_string(x); }); diff --git a/lib/kernels/test/src/kernels/linear_kernels.cc b/lib/kernels/test/src/kernels/linear_kernels.cc new file mode 100644 index 0000000000..423e6be4f1 --- /dev/null +++ b/lib/kernels/test/src/kernels/linear_kernels.cc @@ -0,0 +1,263 @@ +#include "kernels/linear_kernels.h" +#include "internal/test_utils.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/create_accessor_with_contents.h" +#include "kernels/create_local_allocator_for_device_type.h" +#include "kernels/device_handle_t.h" +#include "kernels/device_stream_t.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("linear_forward_kernel cpu-gpu alignment") { + Allocator local_cpu_allocator = create_local_cpu_memory_allocator(); + + // GenericTensorAccessorR toy_input = + // create_2d_accessor_r_with_contents( + // { + // {3, 3, 6}, + // {2, 1, 5}, + // {1, 2, -2}, + // {8, 0.5, -3}, + // }, + // local_cpu_allocator); + // float const *toy_arr = toy_input.get_float_ptr(); + // std::cout << toy_arr[0] << " " + // << toy_arr[1] << " " + // << toy_arr[2] << std::endl; + // + // Allocator local_cuda_allocator = create_local_cuda_memory_allocator(); + // GenericTensorAccessorW toy_cuda = + // local_cuda_allocator.allocate_tensor(toy_input.shape); + // copy_accessor_data_to_l_from_r(toy_cuda, toy_input); + // GenericTensorAccessorW toy_input2 = + // local_cpu_allocator.allocate_tensor(toy_input.shape); + // copy_accessor_data_to_l_from_r(toy_input2, + // read_only_accessor_from_write_accessor(toy_cuda)); CHECK_MESSAGE( + // accessors_are_equal(toy_input, toy_input2), + // check_kv("cpu_result", format_accessor_r_contents(toy_input)), + // check_kv("gpu_result", format_accessor_w_contents(toy_input2))); + + auto run_forward_kernel = [&](DeviceType device_type) { + Allocator allocator = create_local_allocator_for_device_type(device_type); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + allocator); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + allocator); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, allocator); + + int batch_size = 4; + positive_int output_num_channels = 2_p; + + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{positive_int{batch_size}, output_num_channels}}, + DataType::FLOAT, + }; + + GenericTensorAccessorW output = allocator.allocate_tensor(output_shape); + + std::optional managed_handle = + create_local_handle_for_device_type( + device_type, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/output_num_channels, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + }; + + std::optional per_device_state = linear_init_kernel( + /*device_type=*/device_type, + /*handle=*/device_handle_t_from_managed_handle(managed_handle), + /*activation=*/attrs.activation, + /*regularizer=*/attrs.regularizer, + /*use_bias=*/attrs.use_bias, + /*input_type=*/DataType::FLOAT, + /*weight_type=*/DataType::FLOAT, + /*output_type=*/DataType::FLOAT, + /*batch_size=*/batch_size, + /*output_num_channels=*/attrs.out_channels.int_from_positive_int()); + + 
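+      // Minimal sanity-check sketch, assuming linear_init_kernel follows the
+      // same convention as the other init_kernel overloads in this patch:
+      // per-device state is only materialized for GPU runs, so a CPU run
+      // should observe std::nullopt here. The exact ASSERT below is an
+      // assumed, illustrative form rather than a line from the original change.
+      ASSERT((device_type == DeviceType::GPU) == per_device_state.has_value());
+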
device_stream_t stream = get_stream_for_device_type(device_type); + + linear_forward_kernel( + /*stream=*/stream, + /*per_device_state=*/per_device_state, + /*attrs=*/attrs, + /*input_accessor=*/input, + /*output_accessor=*/output, + /*projection_accessor=*/projection, + /*bias_accessor=*/bias); + + return copy_tensor_accessor_w(output, local_cpu_allocator); + }; + + GenericTensorAccessorW cpu_result = run_forward_kernel(DeviceType::CPU); + GenericTensorAccessorW gpu_result = run_forward_kernel(DeviceType::GPU); + + CHECK_MESSAGE( + accessors_are_equal(cpu_result, gpu_result), + check_kv("cpu_result", format_accessor_w_contents(cpu_result)), + check_kv("gpu_result", format_accessor_w_contents(gpu_result))); + } + + TEST_CASE("backward_kernel CPU/GPU alignment (Linear)") { + Allocator local_cpu_allocator = create_local_cpu_memory_allocator(); + + auto run_forward_kernel = [&](DeviceType device_type) { + Allocator allocator = create_local_allocator_for_device_type(device_type); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + allocator); + + GenericTensorAccessorW input_grad = create_zero_filled_accessor_w( + get_tensor_shape_for_accessor_r(input), allocator); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + allocator); + + GenericTensorAccessorW projection_grad = create_zero_filled_accessor_w( + get_tensor_shape_for_accessor_r(projection), allocator); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, allocator); + + GenericTensorAccessorW bias_grad = create_zero_filled_accessor_w( + get_tensor_shape_for_accessor_r(bias), allocator); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {21.0f, 6.5f}, + {14.5f, -1.0f}, + {5.0f, 9.5f}, + {7.5f, 8.0f}, + }, + allocator); + + GenericTensorAccessorR output_grad = + create_2d_accessor_r_with_contents( + { + {1.0f, -0.5f}, + {2.0f, -2.0f}, + {1.0f, 9.0f}, + {-3.5f, 1.0f}, + }, + allocator); + + int batch_size = 4; + positive_int output_num_channels = 2_p; + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/output_num_channels, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + }; + + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{positive_int{batch_size}, + positive_int{output_num_channels}}}, + DataType::FLOAT, + }; + + std::optional managed_handle = + create_local_handle_for_device_type( + device_type, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + std::optional per_device_state = linear_init_kernel( + /*device_type=*/device_type, + /*handle=*/device_handle_t_from_managed_handle(managed_handle), + /*activation=*/attrs.activation, + /*regularizer=*/attrs.regularizer, + /*use_bias=*/true, + /*input_type=*/DataType::FLOAT, + /*weight_type=*/DataType::FLOAT, + /*output_type=*/DataType::FLOAT, + /*batch_size=*/batch_size, + /*output_num_channels=*/attrs.out_channels.int_from_positive_int()); + + device_stream_t stream = get_stream_for_device_type(device_type); + + linear_backward_kernel( + /*stream=*/stream, + /*per_device_state=*/per_device_state, + /*attrs=*/attrs, + /*output=*/output, + /*output_grad=*/output_grad, + /*input=*/input, + /*input_grad=*/input_grad, + /*projection=*/projection, + /*projection_grad=*/projection_grad, + /*bias_grad=*/bias_grad); + + return std::tuple{ + 
copy_tensor_accessor_w(input_grad, local_cpu_allocator), + copy_tensor_accessor_w(projection_grad, local_cpu_allocator), + copy_tensor_accessor_w(bias_grad, local_cpu_allocator), + }; + }; + + auto cpu_results = run_forward_kernel(DeviceType::CPU); + GenericTensorAccessorW cpu_input_grad = std::get<0>(cpu_results); + GenericTensorAccessorW cpu_projection_grad = std::get<1>(cpu_results); + GenericTensorAccessorW cpu_bias_grad = std::get<2>(cpu_results); + + auto gpu_results = run_forward_kernel(DeviceType::GPU); + GenericTensorAccessorW gpu_input_grad = std::get<0>(gpu_results); + GenericTensorAccessorW gpu_projection_grad = std::get<1>(gpu_results); + GenericTensorAccessorW gpu_bias_grad = std::get<2>(gpu_results); + + CHECK_MESSAGE( + accessors_are_equal(cpu_input_grad, gpu_input_grad), + check_kv("cpu_input_grad", format_accessor_w_contents(cpu_input_grad)), + check_kv("gpu_input_grad", format_accessor_w_contents(gpu_input_grad))); + + CHECK_MESSAGE(accessors_are_equal(cpu_projection_grad, gpu_projection_grad), + check_kv("cpu_projection_grad", + format_accessor_w_contents(cpu_projection_grad)), + check_kv("gpu_projection_grad", + format_accessor_w_contents(gpu_projection_grad))); + + CHECK_MESSAGE( + accessors_are_equal(cpu_bias_grad, gpu_bias_grad), + check_kv("cpu_bias_grad", format_accessor_w_contents(cpu_bias_grad)), + check_kv("gpu_bias_grad", format_accessor_w_contents(gpu_bias_grad))); + } +} diff --git a/lib/kernels/test/src/kernels/linear_kernels_cpu.cc b/lib/kernels/test/src/kernels/linear_kernels_cpu.cc new file mode 100644 index 0000000000..0586fd7d1f --- /dev/null +++ b/lib/kernels/test/src/kernels/linear_kernels_cpu.cc @@ -0,0 +1,175 @@ +#include "kernels/linear_kernels_cpu.h" +#include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" +#include "kernels/format_accessor_contents.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("linear_cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/2_p, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + }; + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + cpu_allocator); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + cpu_allocator); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{4_p, attrs.out_channels}}, + DataType::FLOAT, + }, + cpu_allocator); + + linear_cpu_forward_kernel( + /*attrs=*/attrs, + /*input=*/input, + /*output=*/result, + /*projection=*/projection, + /*bias=*/bias); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {21.0f, 6.5f}, + {14.5f, -1.0f}, + {5.0f, 9.5f}, + {7.5f, 8.0f}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("linear_cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/2_p, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + 
}; + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + cpu_allocator); + + GenericTensorAccessorW input_grad = + cpu_allocator.allocate_tensor(get_tensor_shape_for_accessor_r(input)); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + cpu_allocator); + + GenericTensorAccessorW projection_grad = cpu_allocator.allocate_tensor( + get_tensor_shape_for_accessor_r(projection)); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, cpu_allocator); + + GenericTensorAccessorW bias_grad = + cpu_allocator.allocate_tensor(get_tensor_shape_for_accessor_r(bias)); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {21.0f, 6.5f}, + {14.5f, -1.0f}, + {5.0f, 9.5f}, + {7.5f, 8.0f}, + }, + cpu_allocator); + + GenericTensorAccessorR output_grad = + create_2d_accessor_r_with_contents( + { + {1.0f, -0.5f}, + {2.0f, -2.0f}, + {1.0f, 9.0f}, + {-3.5f, 1.0f}, + }, + cpu_allocator); + + linear_cpu_backward_kernel( + /*attrs=*/attrs, + /*output=*/output, + /*output_grad=*/output_grad, + /*input=*/input, + /*input_grad=*/input_grad, + /*projection=*/projection, + /*projection_grad=*/projection_grad, + /*bias_grad=*/bias_grad); + + GenericTensorAccessorR correct_input_grad = + create_2d_accessor_r_with_contents( + { + {0.75f, 0.0f, 2.0f}, + {1.0f, -4.0f, 5.0f}, + {5.5f, 38.0f, -7.5f}, + {-3.0f, -3.0f, -6.25f}, + }, + cpu_allocator); + + GenericTensorAccessorR correct_projection_grad = + create_2d_accessor_r_with_contents( + { + {-20.0f, 5.25f, 24.5f}, + {11.5f, 15.0f, -34.0f}, + }, + cpu_allocator); + + GenericTensorAccessorR correct_bias_grad = + create_1d_accessor_r_with_contents( + { + 1.0f + 2.0f + 1.0f + -3.5f, + -0.5f + -2.0f + 9.0f + 1.0f, + }, + cpu_allocator); + + CHECK_MESSAGE( + accessors_are_equal(input_grad, correct_input_grad), + check_kv("input_grad", format_accessor_w_contents(input_grad))); + + CHECK_MESSAGE(accessors_are_equal(projection_grad, correct_projection_grad), + check_kv("projection_grad", + format_accessor_w_contents(projection_grad))); + + CHECK_MESSAGE(accessors_are_equal(bias_grad, correct_bias_grad), + check_kv("bias_grad", format_accessor_w_contents(bias_grad))); + } +} diff --git a/lib/kernels/test/src/kernels/map_tensor_accessors.cc b/lib/kernels/test/src/kernels/map_tensor_accessors.cc index 60d7c76904..9b61786fc0 100644 --- a/lib/kernels/test/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/map_tensor_accessors.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { map_tensor_accessor_inplace(accessor, [](float x) { return x + 1; }); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return accessor.at(FFOrdered{r, c}); + return accessor.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 2); @@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { input, [](float x) { return x + 1; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 2); @@ -60,7 +60,7 @@ TEST_SUITE(FF_TEST_SUITE) { input, [](float x) -> bool { return x > 2; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == false); @@ -99,7 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { cpu_allocator); 
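// Illustrative sketch of the coordinate convention this hunk migrates to,
// assuming at() is templated on DataType as elsewhere in this codebase: a
// TensorDimsCoord wraps an FFOrdered coordinate, and for a row-major
// accessor with dims {R, C} the coord {r, c} resolves to flat offset
// r * C + c, e.g.
//   float x = result.at<DataType::FLOAT>(TensorDimsCoord{FFOrdered{1_n, 2_n}});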
auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 1); @@ -119,7 +119,7 @@ TEST_SUITE(FF_TEST_SUITE) { cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == true); @@ -150,7 +150,7 @@ TEST_SUITE(FF_TEST_SUITE) { lhs, rhs, DataType::DOUBLE, func, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> double { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == -1); diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc index dd5f8e06f6..a269cf4777 100644 --- a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -1,5 +1,5 @@ #include "kernels/reduce_tensor_accessor.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/kernels/reverse_kernels_cpu.cc similarity index 99% rename from lib/kernels/test/src/cpu/ops/reverse_kernels.cc rename to lib/kernels/test/src/kernels/reverse_kernels_cpu.cc index 5e27b9d350..98ab88bd3f 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/kernels/reverse_kernels_cpu.cc @@ -1,7 +1,7 @@ -#include "kernels/test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" -#include "kernels/reverse_kernels_cpu.h" #include "test/utils/doctest/check_kv.h" #include diff --git a/lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc b/lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc new file mode 100644 index 0000000000..e4660f4fc4 --- /dev/null +++ b/lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc @@ -0,0 +1,178 @@ +#include "kernels/tensor_accessor_unary_ops.h" +#include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" +#include "kernels/format_accessor_contents.h" +#include "test/utils/doctest/check_kv.h" +#include "utils/containers/repeat_element.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("tensor_accessor_scale_by_constant") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + GenericTensorAccessorW result = + tensor_accessor_scale_by_constant(input, -2.0, cpu_allocator); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {-6, -6, -12}, + {0, 2, -1.5}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_relu") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, -3, -6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + GenericTensorAccessorW result = tensor_accessor_relu(input, cpu_allocator); + + GenericTensorAccessorR correct = 
create_2d_accessor_r_with_contents( + { + {3, 0, 0}, + {0, 0, 0.75}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_scale_by_constant_inplace") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW input = create_2d_accessor_w_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + tensor_accessor_scale_by_constant_inplace(input, -2.0); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {-6, -6, -12}, + {0, 2, -1.5}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(input, correct), + check_kv("result", format_accessor_w_contents(input))); + } + + TEST_CASE("tensor_accessor_broadcast") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3}, + {-0.5}, + {6}, + }, + cpu_allocator); + + TensorDims output_dims = TensorDims{ + FFOrdered{4_p, 1_p, 3_p, 2_p}, + }; + + GenericTensorAccessorW result = + tensor_accessor_broadcast(input, output_dims, cpu_allocator); + + GenericTensorAccessorR correct = create_4d_accessor_r_with_contents( + repeat_element(4_n, + std::vector<std::vector<std::vector<double>>>{ + std::vector<std::vector<double>>{ + repeat_element(2_n, 3.0), + repeat_element(2_n, -0.5), + repeat_element(2_n, 6.0), + }}), + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("input", format_accessor_r_contents(input)), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_transpose") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + GenericTensorAccessorW result = + tensor_accessor_transpose(input, cpu_allocator); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {3, 0}, + {3, -1}, + {6, 0.75}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_reduce") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + SUBCASE("inner dim") { + GenericTensorAccessorW result = + tensor_accessor_reduce(input, ff_dim_t{1_n}, cpu_allocator); + + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + { + 3 + 3 + 6, + 0 + (-1) + 0.75, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + SUBCASE("outer dim") { + GenericTensorAccessorW result = + tensor_accessor_reduce(input, ff_dim_t{0_n}, cpu_allocator); + + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + {(3 + 0), (3 + (-1)), (6 + 0.75)}, cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index a086974a74..a313de72d5 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/attention_kernels.h" +#include "internal/test_utils.h" +#include 
"kernels/attention_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -25,7 +25,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - MHAPerDeviceState state = Kernels::MultiHeadAttention::init_kernel( + MHAPerDeviceState state = Kernels::MultiHeadAttention::gpu_init_kernel( managed_handle.raw_handle(), allocator, /*num_samples=*/num_samples.int_from_positive_int(), @@ -71,11 +71,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW weight_accessor = create_random_filled_accessor_w(weight_shape, allocator); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::MultiHeadAttention::forward_kernel( + Kernels::MultiHeadAttention::gpu_forward_kernel( managed_stream.raw_stream(), state, query_accessor.get_float_ptr(), @@ -87,7 +87,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW query_grad_accessor = create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = @@ -99,7 +99,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); - Kernels::MultiHeadAttention::backward_kernel( + Kernels::MultiHeadAttention::gpu_backward_kernel( managed_stream.raw_stream(), state, query_accessor.get_float_ptr(), @@ -113,6 +113,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { output_grad_accessor.get_float_ptr()); } - Kernels::MultiHeadAttention::cleanup_kernel(allocator, state); + Kernels::MultiHeadAttention::gpu_cleanup_kernel(allocator, state); } } diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index b0fe356c95..8a904b7a0d 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/batch_matmul_kernels.h" +#include "internal/test_utils.h" +#include "kernels/batch_matmul_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -41,22 +41,22 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - SUBCASE("forward_kernel") { - Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), - managed_handle.raw_handle(), - output_accessor.get_float_ptr(), - a_accessor.get_float_ptr(), - b_accessor.get_float_ptr(), - m.int_from_positive_int(), - n.int_from_positive_int(), - k.int_from_positive_int(), - batch.int_from_positive_int(), - a_seq_length_dim, - b_seq_length_dim, - seq_length); + SUBCASE("gpu_forward_kernel") { + Kernels::BatchMatmul::gpu_forward_kernel(managed_stream.raw_stream(), + managed_handle.raw_handle(), + output_accessor.get_float_ptr(), + a_accessor.get_float_ptr(), + b_accessor.get_float_ptr(), + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int(), + a_seq_length_dim, + b_seq_length_dim, + seq_length); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW o_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = @@ -64,18 +64,18 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW b_grad_accessor = allocator.allocate_tensor(input_shape_b); - 
Kernels::BatchMatmul::backward_kernel(managed_stream.raw_stream(), - managed_handle.raw_handle(), - output_accessor.get_float_ptr(), - o_grad_accessor.get_float_ptr(), - a_accessor.get_float_ptr(), - a_grad_accessor.get_float_ptr(), - b_accessor.get_float_ptr(), - b_grad_accessor.get_float_ptr(), - m.int_from_positive_int(), - n.int_from_positive_int(), - k.int_from_positive_int(), - batch.int_from_positive_int()); + Kernels::BatchMatmul::gpu_backward_kernel(managed_stream.raw_stream(), + managed_handle.raw_handle(), + output_accessor.get_float_ptr(), + o_grad_accessor.get_float_ptr(), + a_accessor.get_float_ptr(), + a_grad_accessor.get_float_ptr(), + b_accessor.get_float_ptr(), + b_grad_accessor.get_float_ptr(), + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int()); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index c173fd6d24..1be45d8bbb 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/batch_norm_kernels.h" +#include "internal/test_utils.h" +#include "kernels/batch_norm_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -19,7 +19,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - BatchNormPerDeviceState state = Kernels::BatchNorm::init_kernel( + BatchNormPerDeviceState state = Kernels::BatchNorm::gpu_init_kernel( /*handle=*/managed_handle.raw_handle(), /*allocator=*/allocator, /*runningMean=*/nullptr, @@ -53,11 +53,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW scale_accessor = create_filled_accessor_w( scale_shape, allocator, make_float_data_type_value(1)); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW bias_accessor = create_filled_accessor_w( bias_shape, allocator, make_float_data_type_value(0)); - Kernels::BatchNorm::forward_kernel( + Kernels::BatchNorm::gpu_forward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, /*input_ptr=*/input_accessor.get_float_ptr(), @@ -68,7 +68,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = @@ -78,7 +78,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW bias_grad_accessor = create_random_filled_accessor_w(bias_shape, allocator); - Kernels::BatchNorm::backward_kernel( + Kernels::BatchNorm::gpu_backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, /*output_ptr=*/output_accessor.get_float_ptr(), @@ -89,19 +89,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), /*bias_grad_ptr=*/bias_grad_accessor.get_float_ptr(), /*numElements=*/ - input_accessor.shape.num_elements().int_from_positive_int()); + get_num_elements(input_accessor.shape.dims).int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); CHECK(contains_non_zero(scale_grad_accessor)); CHECK(contains_non_zero(bias_grad_accessor)); } - Kernels::BatchNorm::cleanup_kernel(allocator, - state.inputTensor, - state.biasTensor, - state.outputTensor, - state.actiDesc, - true, - state.runningMean); + Kernels::BatchNorm::gpu_cleanup_kernel(allocator, state); } } diff 
--git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 9472e44a15..5657db71ef 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,6 +1,7 @@ -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/cast_kernels.h" #include "kernels/cast_kernels_cpu.h" +#include "kernels/cast_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -19,27 +20,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::DOUBLE, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Cast::forward_kernel( + Kernels::Cast::gpu_forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR grad_output_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = create_zero_filled_accessor_w(input_shape, allocator); - Kernels::Cast::backward_kernel(managed_stream.raw_stream(), - grad_output_accessor, - grad_input_accessor); + Kernels::Cast::gpu_backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); CHECK(contains_non_zero(grad_input_accessor)); } @@ -68,7 +69,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Cast::forward_kernel( + Kernels::Cast::gpu_forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); // Run CPU Forward Kernel diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc deleted file mode 100644 index 7ac4d0f881..0000000000 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/combine_kernels.h" -#include "kernels/combine_kernels_cpu.h" -#include - -using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Call Combine Forward and Backward Kernels") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator allocator = create_local_cuda_memory_allocator(); - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_p, 100_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = input_shape; - - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Combine::forward_kernel( - managed_stream.raw_stream(), input_accessor, output_accessor); - - CHECK(contains_non_zero(output_accessor)); - } - - SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - GenericTensorAccessorW input_grad_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Combine::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); - } - } - - TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { - 
ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_p, 5_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = input_shape; - - SUBCASE("forward_kernel") { - // Run GPU Combine Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - - Kernels::Combine::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - // Run CPU Combine Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - - Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Combine Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - create_zero_filled_accessor_w(input_shape, gpu_allocator); - - Kernels::Combine::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor_gpu, - input_grad_accessor_gpu); - - // Run CPU Combine Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - create_zero_filled_accessor_w(input_shape, cpu_allocator); - - Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, - input_grad_accessor_cpu); - - CHECK(accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); - } - } -} diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 5dc8e441bd..e3fdd3ad61 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/concat_kernels.h" +#include "internal/test_utils.h" +#include "kernels/concat_kernels_gpu.h" #include "utils/containers/repeat.h" #include @@ -14,7 +14,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { const positive_int num_inputs = 4_p; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { auto run_forward_test = [&](positive_int input_rows, positive_int input_cols, TensorShape output_shape, @@ -32,10 +32,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); + Kernels::Concat::gpu_forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); CHECK(contains_non_zero(output_accessor)); }; @@ -61,7 +61,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { auto run_backward_test = [&](positive_int input_rows, positive_int input_cols, TensorShape output_shape, @@ -79,10 +79,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { return create_zero_filled_accessor_w(input_shape, allocator); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - 
concat_axis); + Kernels::Concat::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); for (auto &accessor : input_grad_accessors) { CHECK(contains_non_zero(accessor)); diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index 60bc6251b2..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,4 +1,4 @@ -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include #include diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index fb8b8dc87c..f6048e6771 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/dropout_kernels.h" +#include "internal/test_utils.h" +#include "kernels/dropout_kernels_gpu.h" #include "utils/containers/count.h" #include @@ -9,10 +9,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { unsigned long long seed = 12345; float dropout_rate = 0.1; - ArrayShape shape = ArrayShape{ - std::vector{10_p, 10_p}, - }; - TensorShape input_shape = TensorShape{ TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, @@ -26,8 +22,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - DropoutPerDeviceState state = Kernels::Dropout::init_kernel( - managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); + DropoutPerDeviceState state = + Kernels::Dropout::gpu_init_kernel(managed_handle.raw_handle(), + dropout_rate, + seed, + output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = @@ -35,10 +35,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Dropout::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr()); + Kernels::Dropout::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor.get_float_ptr(), + output_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } @@ -49,16 +49,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW input_grad_data = create_random_filled_accessor_w(input_shape, allocator); - Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_data.get_float_ptr(), - input_grad_data.get_float_ptr()); + Kernels::Dropout::gpu_backward_kernel(managed_stream.raw_stream(), + state, + output_grad_data.get_float_ptr(), + input_grad_data.get_float_ptr()); } - Kernels::Dropout::cleanup_kernel(allocator, - state.inputTensor, - state.outputTensor, - state.dropoutDesc, - state.dropoutStates); + Kernels::Dropout::gpu_cleanup_kernel(allocator, state); } } diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index cea07ce781..71831ced48 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/flat_kernels.h" +#include "internal/test_utils.h" +#include "kernels/flat_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -23,27 +23,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { read_only_accessor_from_write_accessor(create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(2))); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - 
Kernels::Flat::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor.get_float_ptr()); + Kernels::Flat::gpu_forward_kernel(managed_stream.raw_stream(), + input_accessor, + output_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( output_shape, allocator, make_float_data_type_value(0)); GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(1)); - Kernels::Flat::backward_kernel(managed_stream.raw_stream(), - input_accessor, - output_grad_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr()); + Kernels::Flat::gpu_backward_kernel(managed_stream.raw_stream(), + input_accessor, + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 6a553bd107..d08058b063 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/gather_kernels.h" +#include "internal/test_utils.h" +#include "kernels/gather_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -13,13 +13,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{0_n}}; - - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { auto run_forward_test = [&](TensorShape input_shape, TensorShape index_shape, TensorShape output_shape) { + ff_dim_t dim = ff_dim_t{ + nonnegative_int{ + get_num_dims(input_shape.dims).unwrap_nonnegative() - 1}, + }; + GatherPerDeviceState state = + Kernels::Gather::gpu_init_kernel(managed_handle.raw_handle(), dim); + GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorR index_accessor = @@ -27,11 +31,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); + Kernels::Gather::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); CHECK(contains_non_zero(output_accessor)); }; @@ -69,10 +73,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { auto run_backward_test = [&](TensorShape input_shape, TensorShape index_shape, TensorShape output_shape) { + ff_dim_t dim = ff_dim_t{ + nonnegative_int{ + get_num_dims(input_shape.dims).unwrap_nonnegative() - 1}, + }; + GatherPerDeviceState state = + Kernels::Gather::gpu_init_kernel(managed_handle.raw_handle(), dim); + GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorR index_accessor = @@ -80,11 +91,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); + Kernels::Gather::gpu_backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + 
input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); }; diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5382bb3a84..e6796b5768 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/layer_norm_kernels.h" +#include "internal/test_utils.h" +#include "kernels/layer_norm_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -29,34 +29,34 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - LayerNormPerDeviceState state = - Kernels::LayerNorm::init_kernel(managed_handle.raw_handle(), - allocator, - elementwise_affine, - batch_size.int_from_positive_int(), - feature_size.int_from_positive_int(), - epsilon); + LayerNormPerDeviceState state = Kernels::LayerNorm::gpu_init_kernel( + managed_handle.raw_handle(), + allocator, + elementwise_affine, + batch_size.int_from_positive_int(), + feature_size.int_from_positive_int(), + epsilon); GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( feature_shape, allocator, make_float_data_type_value(1)); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); GenericTensorAccessorW beta_accessor = create_filled_accessor_w( feature_shape, allocator, make_float_data_type_value(0)); - Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - output_accessor, - gamma_accessor, - beta_accessor); + Kernels::LayerNorm::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + output_accessor, + gamma_accessor, + beta_accessor); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = @@ -66,7 +66,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW beta_grad_accessor = allocator.allocate_tensor(feature_shape); - Kernels::LayerNorm::backward_kernel( + Kernels::LayerNorm::gpu_backward_kernel( managed_stream.raw_stream(), state, output_grad_accessor, diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc deleted file mode 100644 index 25a346446b..0000000000 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ /dev/null @@ -1,107 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/gather_kernels.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Test ManagedFFStream") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); - - GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{0_n}}; - - SUBCASE("forward_kernel") { - auto run_forward_test = [&](TensorShape const &input_shape, - TensorShape const &index_shape, - TensorShape const &output_shape) { - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorR index_accessor = - create_random_filled_accessor_r(index_shape, allocator); - GenericTensorAccessorW 
output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), - /*per_device_state=*/state, - /*input=*/input_accessor, - /*index=*/index_accessor, - /*output=*/output_accessor); - - CHECK(contains_non_zero(output_accessor)); - }; - - SUBCASE("test gather forward, 2D") { - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 100_p}}, - DataType::FLOAT, - }; - TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 20_p}}, - DataType::INT32, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 20_p}}, - DataType::FLOAT, - }; - run_forward_test(input_shape, index_shape, output_shape); - } - - SUBCASE("test gather forward, 1D") { - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_p}}, - DataType::FLOAT, - }; - TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{10_p}}, - DataType::INT32, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_p}}, - DataType::FLOAT, - }; - run_forward_test(input_shape, index_shape, output_shape); - } - } - - SUBCASE("backward_kernel") { - auto run_backward_test = [&](TensorShape const &input_shape, - TensorShape const &index_shape, - TensorShape const &output_shape) { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - GenericTensorAccessorR index_accessor = - create_random_filled_accessor_r(index_shape, allocator); - GenericTensorAccessorW input_grad_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), - /*per_device_state=*/state, - /*output_grad=*/output_grad_accessor, - /*index=*/index_accessor, - /*input_grad=*/input_grad_accessor); - CHECK(contains_non_zero(input_grad_accessor)); - }; - - SUBCASE("test gather backward, 2D") { - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 100_p}}, - DataType::FLOAT, - }; - TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 25_p}}, - DataType::INT32, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 25_p}}, - DataType::FLOAT, - }; - run_backward_test(input_shape, index_shape, output_shape); - } - } - } -} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc deleted file mode 100644 index c042ae3175..0000000000 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/partition_kernels.h" -#include "op-attrs/datatype_value.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator allocator = create_local_cuda_memory_allocator(); - - RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( - managed_handle.raw_handle(), DataType::FLOAT); - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_p, 10_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = input_shape; - - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = create_filled_accessor_r( - input_shape, allocator, make_float_data_type_value(1)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - 
Kernels::Repartition::forward_kernel( - managed_stream.raw_stream(), state, input_accessor, output_accessor); - - CHECK(contains_non_zero(output_accessor)); - } - - SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( - output_shape, allocator, make_float_data_type_value(1)); - GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( - input_shape, allocator, make_float_data_type_value(2)); - - Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); - } - } -} diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 58fff5c884..5aa32899cb 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/pool_2d_kernels.h" +#include "internal/test_utils.h" +#include "kernels/pool_2d_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -30,7 +30,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - Pool2DPerDeviceState state = Kernels::Pool2D::init_kernel( + Pool2DPerDeviceState state = Kernels::Pool2D::gpu_init_kernel( /*handle=*/managed_handle.raw_handle(), /*activation=*/std::nullopt, /*input_w=*/input_w.int_from_positive_int(), @@ -63,27 +63,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - SUBCASE("forward_kernel") { - Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.ptr, - output_accessor.ptr); + SUBCASE("gpu_forward_kernel") { + Kernels::Pool2D::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor.ptr, + output_accessor.ptr); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), - state, - output_accessor.ptr, - output_grad_accessor.ptr, - input_accessor.ptr, - input_grad_accessor.ptr); + Kernels::Pool2D::gpu_backward_kernel(managed_stream.raw_stream(), + state, + output_accessor.ptr, + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc deleted file mode 100644 index 4d030c4d93..0000000000 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/reduction_kernels.h" -#include "op-attrs/datatype_value.h" -#include - -using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Test Reduction Forward and Backward Kernel") { - std::size_t num_replicas = 5; - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_p, 10_p, 10_p, 10_p, 10_p}}, - DataType::FLOAT, - }; - - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator allocator = create_local_cuda_memory_allocator(); - - 
SUBCASE("forward_kernel") { - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_p}}, - DataType::FLOAT, - }; - - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Reduction::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - num_replicas); - - CHECK(contains_non_zero(output_accessor)); - } - - SUBCASE("backward_kernel") { - TensorShape output_shape = input_shape; - - GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( - output_shape, allocator, make_float_data_type_value(1)); - GenericTensorAccessorW input_grad_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); - } - } -} diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc deleted file mode 100644 index 9806cefe8d..0000000000 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ /dev/null @@ -1,153 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/create_accessor_with_contents.h" -#include "kernels/format_accessor_contents.h" -#include "kernels/replicate_kernels.h" -#include "kernels/replicate_kernels_cpu.h" -#include "test/utils/doctest/check_kv.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Call Replicate Forward and Backward Kernels") { - nonnegative_int num_replicas = 10_n; - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::FLOAT, - }; - - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - GenericTensorAccessorR input = - create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); - - GenericTensorAccessorW output = - gpu_allocator.allocate_tensor(output_shape); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input, output); - - GenericTensorAccessorR correct = input; - - CHECK_MESSAGE(accessors_are_equal(output, correct), - check_kv("output", format_accessor_w_contents(output))); - } - - SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad = - create_2d_accessor_r_with_contents( - { - {1, 2, 3}, - {4, 3, 3}, - {1, 3, 5}, - }, - gpu_allocator); - - GenericTensorAccessorR correct = - create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); - - GenericTensorAccessorW input_grad = - gpu_allocator.allocate_tensor(input_shape); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - output_grad, - input_grad, - num_replicas.unwrap_nonnegative()); - - CHECK_MESSAGE( - accessors_are_equal(input_grad, correct), - check_kv("input_grad", format_accessor_w_contents(input_grad))); - } - } - - TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - positive_int num_replicas = 2_p; - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = TensorShape{ - 
TensorDims{FFOrdered{5_p, num_replicas}}, - DataType::FLOAT, - }; - - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - // Run GPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - create_zero_filled_accessor_w(output_shape, gpu_allocator); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - // Run CPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - create_zero_filled_accessor_w(output_shape, cpu_allocator); - - Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - CHECK_MESSAGE( - accessors_are_equal(output_accessor_gpu, output_accessor_cpu), - check_kv("input", format_accessor_r_contents(input_accessor_cpu)), - check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)), - check_kv("cpu", format_accessor_w_contents(output_accessor_cpu))); - } - - SUBCASE("backward_kernel") { - // Run GPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - create_zero_filled_accessor_w(input_shape, gpu_allocator); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor_gpu, - input_grad_accessor_gpu, - num_replicas.int_from_positive_int()); - - // Run CPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - create_zero_filled_accessor_w(input_shape, cpu_allocator); - - Kernels::Replicate::cpu_backward_kernel( - output_grad_accessor_cpu, - input_grad_accessor_cpu, - num_replicas.int_from_positive_int()); - - CHECK_MESSAGE( - accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), - check_kv("output_grad", - format_accessor_r_contents(output_grad_accessor_cpu)), - check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)), - check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu))); - } - } -} diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 011f35e567..ad598c9055 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/reshape_kernels.h" +#include "internal/test_utils.h" +#include "kernels/reshape_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -16,33 +16,32 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; - TensorShape output_shape = input_shape; - - ReshapePerDeviceState state = - Kernels::Reshape::init_kernel(DataType::FLOAT); + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_p}}, + DataType::INT32, + }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, 
allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Reshape::forward_kernel( - managed_stream.raw_stream(), state, input_accessor, output_accessor); + Kernels::Reshape::gpu_forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - input_grad_accessor); + Kernels::Reshape::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index fc5c8deaad..731b530910 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,6 +1,6 @@ -#include "kernels/test_utils.h" -#include "kernels/reverse_kernels.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels_cpu.h" +#include "kernels/reverse_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include <doctest/doctest.h> @@ -24,29 +24,29 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor(create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Reverse::forward_kernel( + Kernels::Reverse::gpu_forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor, attrs); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessor, - attrs); + Kernels::Reverse::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor, + attrs); CHECK(contains_non_zero(input_grad_accessor)); } @@ -71,17 +71,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { // Run GPU Reverse Forward Kernel GenericTensorAccessorR input_accessor_gpu = create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu, - output_accessor_gpu, - attrs); + Kernels::Reverse::gpu_forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu, + output_accessor_gpu, + attrs); // Run CPU Reverse Forward Kernel GenericTensorAccessorR input_accessor_cpu = copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = create_zero_filled_accessor_w(output_shape, cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { // Run GPU Reverse Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = create_random_filled_accessor_r(output_shape, gpu_allocator); 
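// A minimal standalone sketch of the reverse-along-axis-0 semantics that the
// GPU/CPU parity subcases in this file check; `reverse_axis0` is a
// hypothetical helper for illustration, not the library's cpu kernel.
#include <cstddef>
#include <vector>

std::vector<float> reverse_axis0(std::vector<float> const &in,
                                 std::size_t num_rows,
                                 std::size_t num_cols) {
  // Output row i is input row (num_rows - 1 - i); order within each row is
  // unchanged, matching ReverseAttrs{/*axis=*/ff_dim_t{0_n}} above.
  std::vector<float> out(in.size());
  for (std::size_t i = 0; i < num_rows; i++) {
    for (std::size_t j = 0; j < num_cols; j++) {
      out[i * num_cols + j] = in[(num_rows - 1 - i) * num_cols + j];
    }
  }
  return out;
}

// The backward pass applies the same permutation to the gradient, so the
// backward subcase below can compare GPU and CPU outputs element-for-element.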
@@ -103,10 +103,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW input_grad_accessor_gpu = create_zero_filled_accessor_w(input_shape, gpu_allocator); - Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor_gpu, - input_grad_accessor_gpu, - attrs); + Kernels::Reverse::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + attrs); // Run CPU Reverse Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index bb449f6755..ca94bf58d1 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/softmax_kernels.h" +#include "internal/test_utils.h" +#include "kernels/softmax_kernels_gpu.h" #include <doctest/doctest.h> using namespace ::FlexFlow; @@ -26,39 +26,40 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = - Kernels::Softmax::init_kernel(managed_handle.raw_handle(), - 0, - input_n.unwrap_nonnegative(), - channels.unwrap_nonnegative(), - input_h.unwrap_nonnegative(), - input_w.unwrap_nonnegative()); + Kernels::Softmax::gpu_init_kernel(managed_handle.raw_handle(), + ff_dim_t{3_n}, + input_n.unwrap_nonnegative(), + channels.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); - Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr()); + Kernels::Softmax::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor.get_float_ptr(), + output_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Softmax::backward_kernel( + Kernels::Softmax::gpu_backward_kernel( managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - output_grad_accessor.shape.num_elements().int_from_positive_int()); + get_num_elements(output_grad_accessor.shape.dims) + .int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 2597db95e0..35866308ee 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/split_kernels.h" +#include "internal/test_utils.h" +#include "kernels/split_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" #include <doctest/doctest.h> @@ -9,9 +9,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; - coord_t out_blk_sizes[] = {50, 50}; - coord_t in_blk_size = 100; - coord_t num_blks = 1; + int out_blk_sizes[] = {50, 50}; + int in_blk_size = 100; + int num_blks = 1; ManagedPerDeviceFFHandle 
managed_handle = initialize_single_gpu_handle( /*workSpaceSize=*/1024 * 1024, @@ -29,7 +29,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::FLOAT, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -39,16 +39,16 @@ return output_accessor.get_float_ptr(); }); - Kernels::Split::forward_kernel(managed_stream.raw_stream(), - output_ptrs.data(), - input_accessor.get_float_ptr(), - out_blk_sizes, - in_blk_size, - num_blks, - num_outputs.unwrap_nonnegative()); + Kernels::Split::gpu_forward_kernel(managed_stream.raw_stream(), + output_ptrs.data(), + input_accessor.get_float_ptr(), + out_blk_sizes, + in_blk_size, + num_blks, + num_outputs.unwrap_nonnegative()); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { std::vector<float *> output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = @@ -59,13 +59,14 @@ GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(0)); - Kernels::Split::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), - (float const **)output_grad_ptrs.data(), - out_blk_sizes, - in_blk_size, - num_blks, - num_outputs.unwrap_nonnegative()); + Kernels::Split::gpu_backward_kernel( + managed_stream.raw_stream(), + input_grad_accessor.get_float_ptr(), + (float const **)output_grad_ptrs.data(), + out_blk_sizes, + in_blk_size, + num_blks, + num_outputs.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index c0b2d4db5e..9d4809b2cf 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/transpose_kernels.h" +#include "internal/test_utils.h" +#include "kernels/transpose_kernels_gpu.h" #include <doctest/doctest.h> @@ -25,28 +25,28 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; TensorShape output_shape = input_shape; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Transpose::forward_kernel( + Kernels::Transpose::gpu_forward_kernel( managed_stream.raw_stream(), attrs, input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); - Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), - attrs, - output_grad_accessor, - input_grad_accessor); + Kernels::Transpose::gpu_backward_kernel(managed_stream.raw_stream(), + attrs, + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index db0cf7603f..b75f81fb3e 100644 --- a/lib/local-execution/CMakeLists.txt +++ b/lib/local-execution/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_library( task-spec pcg spdlog + compiler ) add_subdirectory(test) diff --git 
a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h deleted file mode 100644 index f3face6ace..0000000000 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H -#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H - -#include "local-execution/allocated_tensors.dtg.h" -#include "pcg/computation_graph.h" - -namespace FlexFlow { - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool are_allocated_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &, - std::unordered_map const &, - ArrayShape const &); - -AllocatedTensors make_empty_allocated_tensors(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml deleted file mode 100644 index 33985b0d74..0000000000 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ /dev/null @@ -1,30 +0,0 @@ -namespace = "FlexFlow" -name = "AllocatedTensors" -features = [ - "eq", - "fmt", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "kernels/accessor.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/include/local-execution/cost_details.struct.toml b/lib/local-execution/include/local-execution/cost_details.struct.toml deleted file mode 100644 index d17438b9ff..0000000000 --- a/lib/local-execution/include/local-execution/cost_details.struct.toml +++ /dev/null @@ -1,18 +0,0 @@ -namespace = "FlexFlow" -name = "CostDetails" -features = [ - "eq", - "ord", - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[fields]] -name = "total_elapsed_time" -type = "float" - -[[fields]] -name = "total_mem_usage" -type = "size_t" diff --git a/lib/local-execution/include/local-execution/cost_estimate.h b/lib/local-execution/include/local-execution/cost_estimate.h deleted file mode 100644 index 7020089ccf..0000000000 --- a/lib/local-execution/include/local-execution/cost_estimate.h +++ /dev/null @@ -1,63 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_ESTIMATE_H -#define _FLEXFLOW_LOCAL_EXECUTION_COST_ESTIMATE_H - -#include "local-execution/cost_details.dtg.h" -#include "local-execution/local_training_backing.h" -#include "op-attrs/parallel_tensor_shape.dtg.h" -#include "op-attrs/pcg_operator_attrs.dtg.h" -#include "pcg/machine_view.h" -#include "pcg/parallel_computation_graph/parallel_tensor_attrs.dtg.h" -namespace FlexFlow { - -struct ICostEstimator { - virtual CostDetails - estimate_cost(PCGOperatorAttrs const &op, - std::vector const &inputs, 
- std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const = 0; - virtual float estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const = 0; - - ICostEstimator() = default; - ICostEstimator(ICostEstimator const &) = delete; - ICostEstimator &operator=(ICostEstimator const &) = delete; - - virtual ~ICostEstimator() = default; -}; -CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); - -struct CostEstimator { - CostDetails estimate_cost(PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const { - return this->implementation_ptr->estimate_cost( - op, inputs, weights, outputs, mv); - } - - float estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const { - return this->implementation_ptr->estimate_cost(tensor_shape, src, dst); - } - - template - static typename std::enable_if::value, - CostEstimator>::type - create(Args &&...args) { - return CostEstimator(std::make_shared(std::forward(args)...)); - } - -private: - CostEstimator(std::shared_ptr implementation_ptr) - : implementation_ptr(implementation_ptr) {} - std::shared_ptr implementation_ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/cost_metrics.h b/lib/local-execution/include/local-execution/cost_metrics.h deleted file mode 100644 index edc0190daf..0000000000 --- a/lib/local-execution/include/local-execution/cost_metrics.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H -#define _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H - -#include "utils/visitable.h" - -namespace FlexFlow { - -/** - * @brief Costs of an operator. - */ -struct CostMetrics : public use_visitable_cmp { - CostMetrics() = delete; - CostMetrics(float forward_time, - float backward_time, - float sync_type, - size_t inputs_memory, - size_t outputs_memory, - size_t weights_memory); - /** - * @brief Return the sum of inputs_memory, outputs_memory, and weights_memory - * recorded in this CostMetrics. - */ - size_t total_memory() const; - - /** - * @brief Return the sum of memory recorded in this CostMetrics, but in MB, - * instead of Bytes. - */ - float total_memory_in_mb() const; - - /** - * @brief Get the incremental difference between the total memory in - * CostMetrics and sim->offset. - * @details This is to easily compute the difference between sim->offset and - * sum of all memory usage recorded in this CostMetrics. - * - * @param sim_offset Simulator->offset - * @return size_t The incremental memory usage difference - */ - size_t total_mem_diff_from(off_t sim_offset) const; - -public: - float forward_time; - float backward_time; - float sync_time; - ///< Bytes of memory usage of different parts - // Assume: - // 1. all memory allocations use Simulator::allocate - // 2. 
we call Simulator::free_all before measuring an operator
-  // Therefore, the current memory usage of an operator is (size_t)sim->offset
-  size_t inputs_memory;
-  size_t outputs_memory;
-  size_t weights_memory;
-  ///< Memory usage of Op* considering parallelization over devices
-  size_t op_total_mem;
-};
-
-} // namespace FlexFlow
-
-VISITABLE_STRUCT(::FlexFlow::CostMetrics,
-                 forward_time,
-                 backward_time,
-                 sync_time,
-                 inputs_memory,
-                 outputs_memory,
-                 weights_memory,
-                 op_total_mem);
-MAKE_VISIT_HASHABLE(::FlexFlow::CostMetrics);
-
-#endif
diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h
deleted file mode 100644
index d724859712..0000000000
--- a/lib/local-execution/include/local-execution/gradient_tensor_source.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H
-#define _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H
-
-#include "task-spec/gradient_tensor_t.dtg.h"
-
-namespace FlexFlow {
-
-struct GradientTensorSource {
-public:
-  GradientTensorSource();
-
-  gradient_tensor_t new_gradient_tensor();
-
-  void reset();
-
-private:
-  static size_t next_available_gradient_tensor_id;
-};
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h
index e9044dc6fa..1e9a30d293 100644
--- a/lib/local-execution/include/local-execution/local_args_backing.h
+++ b/lib/local-execution/include/local-execution/local_args_backing.h
@@ -1,37 +1,37 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H
 #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H
 
+#include "local-execution/local_args_backing.dtg.h"
 #include "local-execution/local_task_argument_accessor.h"
+#include "local-execution/local_task_registry.dtg.h"
+#include "local-execution/local_tensor_backing.dtg.h"
 #include "pcg/computation_graph.h"
-#include "pcg/layer_guid_t.dtg.h"
-#include "task-spec/op_task_invocation.h"
 #include "task-spec/per_device_op_state.h"
-#include "task-spec/runtime_arg_config.h"
+#include "task-spec/task_binding.h"
 #include "task-spec/task_invocation.dtg.h"
+#include "task-spec/training_computation_graph.dtg.h"
+#include "task-spec/training_layer_plus_context.dtg.h"
 
 namespace FlexFlow {
 
-struct LocalArgsBacking {
-  LocalArgsBacking(
-      RuntimeArgConfig const &,
-      std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> const &);
-
-public:
-  // arguments
-  RuntimeArgConfig runtime_arg_config;
-  std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates>
-      per_device_op_states;
-};
-
-LocalArgsBacking
-    make_args_backing_with_empty_device_states(RuntimeArgConfig const &);
+LocalArgsBacking make_local_computation_args_backing_with_empty_device_states(
+    RuntimeArgConfig const &);
 
 std::optional<DeviceSpecificDeviceStates>
     get_per_device_op_state_if_exists(LocalArgsBacking const &,
                                       layer_guid_t const &);
 
-ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &,
-                                            RuntimeArgConfig const &);
+std::unordered_map<slot_id_t, ConcreteArgSpec>
+    construct_arg_slots_backing(TaskBinding const &, RuntimeArgConfig const &);
+
+TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &,
+                                           RuntimeArgConfig const &,
+                                           TaskInvocation const &,
+                                           Allocator &);
+
+LocalArgsBacking make_local_args_backing_for_computation_graph(
+    RuntimeArgConfig const &,
+    std::unordered_map<layer_guid_t,
+                       std::optional<DeviceSpecificDeviceStates>> const &);
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/include/local-execution/local_args_backing.struct.toml b/lib/local-execution/include/local-execution/local_args_backing.struct.toml
new file
mode 100644 index 0000000000..449f883194 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_args_backing.struct.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "LocalArgsBacking" +features = [] + +includes = [ + "task-spec/runtime_arg_config.dtg.h", + "task-spec/device_specific_device_states.dtg.h", + "pcg/layer_guid_t.dtg.h", + "", +] + +[[fields]] +name = "runtime_arg_config" +type = "::FlexFlow::RuntimeArgConfig" + +[[fields]] +name = "per_device_op_states" +type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::DeviceSpecificDeviceStates>>" diff --git a/lib/local-execution/include/local-execution/local_cost_estimator.h b/lib/local-execution/include/local-execution/local_cost_estimator.h index 0189475fcb..c42876bbd6 100644 --- a/lib/local-execution/include/local-execution/local_cost_estimator.h +++ b/lib/local-execution/include/local-execution/local_cost_estimator.h @@ -1,26 +1,22 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_COST_ESTIMATOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_COST_ESTIMATOR_H -#include "local-execution/cost_estimate.h" -#include "task-spec/runtime_arg_config.h" +#include "compiler/cost_estimator/cost_estimator.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/runtime_arg_config.dtg.h" namespace FlexFlow { struct LocalCostEstimator : public ICostEstimator { LocalCostEstimator(RuntimeArgConfig const &); + LocalCostEstimator(LocalCostEstimator const &) = delete; LocalCostEstimator(LocalCostEstimator &&) = delete; ~LocalCostEstimator() = default; - CostDetails estimate_cost(PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const override; + OpCostMetrics estimate_cost(OpCostEstimateKey const &) const override; - float estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const override; + milliseconds_t estimate_cost(TensorSetMovement const &) const override; private: RuntimeArgConfig runtime_arg_config; diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index 184bf0b559..0ab66234eb 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,22 +1,21 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "task-spec/slot_tensor_type_id.dtg.h" +#include "local-execution/tensor_slot_backing.dtg.h" +#include "task-spec/runtime_arg_config.dtg.h" #include "task-spec/task_argument_accessor.h" +#include "task-spec/tensor_sub_slot_id_t.dtg.h" #include #include namespace FlexFlow { -using TensorSlotsBacking = std::unordered_map< - SlotTensorTypeId, - std::variant>>; -using ArgSlotsBacking = std::unordered_map; - struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { - LocalTaskArgumentAccessor(Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing); + explicit LocalTaskArgumentAccessor( + Allocator const &allocator, + std::unordered_map const + &tensor_slots_backing, + std::unordered_map const &arg_slots_backing); LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete; LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete; @@ -35,8 +34,9 @@ struct 
LocalTaskArgumentAccessor : public ITaskArgumentAccessor { private: Allocator allocator; - TensorSlotsBacking tensor_slots_backing; - ArgSlotsBacking arg_slots_backing; + std::unordered_map + tensor_slots_backing; + std::unordered_map arg_slots_backing; }; CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); diff --git a/lib/local-execution/include/local-execution/local_task_registry.h b/lib/local-execution/include/local-execution/local_task_registry.h new file mode 100644 index 0000000000..142433ba53 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_task_registry.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H + +#include "local-execution/local_task_registry.dtg.h" +#include "local-execution/registered_task_t.dtg.h" +#include "pcg/layer_attrs.dtg.h" +#include "task-spec/op_task_type.dtg.h" +#include "utils/units/milliseconds_t.h" + +namespace FlexFlow { + +LocalTaskRegistry construct_local_task_registry_for_layers( + std::unordered_map const &); + +std::optional try_get_registered_task( + LocalTaskRegistry const &, layer_guid_t const &, OpTaskType const &); + +std::optional call_task_impl(LocalTaskRegistry const &, + task_id_t const &task_id, + TaskArgumentAccessor const &acc); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/local_task_registry.struct.toml b/lib/local-execution/include/local-execution/local_task_registry.struct.toml new file mode 100644 index 0000000000..84abc7aa0c --- /dev/null +++ b/lib/local-execution/include/local-execution/local_task_registry.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "LocalTaskRegistry" +features = [ + "eq", + "fmt", + "hash" +] + +includes = [ + "task-spec/task_signature_impl.dtg.h", + "pcg/layer_guid_t.dtg.h", + "local-execution/operator_task_set.dtg.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", +] + +[[fields]] +name = "task_sets" +type = "std::unordered_map<::FlexFlow::layer_guid_t, ::FlexFlow::OperatorTaskSet>" + +[[fields]] +name = "task_mapping" +type = "std::unordered_map<::FlexFlow::task_id_t, ::FlexFlow::TaskSignatureAndImpl>" diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index f6168f2fb1..479ad4734a 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -1,46 +1,30 @@ - #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #include "kernels/accessor.h" -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_task_argument_accessor.h" +#include "kernels/allocation.h" #include "local-execution/local_tensor_backing.dtg.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.dtg.h" -#include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/task_invocation.dtg.h" -#include "task-spec/tensor_role.dtg.h" +#include "local-execution/tensor_slot_backing.dtg.h" +#include "task-spec/task_binding.h" +#include "task-spec/training_computation_graph.dtg.h" +#include 
"task-spec/training_tensor_guid_t.dtg.h" namespace FlexFlow { -GenericTensorAccessorW get_tensor(LocalTensorBacking const &, - TensorTypeVariant const &); - -std::unordered_map - get_tensor_backings( - std::unordered_map const &, - std::unordered_map const &, - Allocator &); - -std::unordered_map> - merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated); +LocalTensorBacking construct_local_tensor_backing( + std::unordered_map const + &training_tensor_shapes, + std::unordered_map const + &preallocated_tensors, + Allocator &); -LocalTensorBacking construct_local_tensor_backing(AllocatedTensors const &, - UnallocatedTensors const &, - Allocator &); +GenericTensorAccessorW + get_accessor_for_training_tensor(LocalTensorBacking const &, + training_tensor_guid_t); -TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, - TaskBinding const &); +std::unordered_map + construct_tensor_slots_backing_for_binding(LocalTensorBacking const &, + TaskBinding const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml index bd59ec325d..48a7a7fa90 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -6,26 +6,14 @@ features = [ ] includes = [ - "task-spec/tensor_type_t.dtg.h", "kernels/accessor.h", - "pcg/tensor_guid_t.dtg.h", - "task-spec/gradient_tensor_t.dtg.h", - "task-spec/optimizer_tensor_t.dtg.h", + "task-spec/training_tensor_guid_t.dtg.h", ] src_includes = [ "utils/fmt/unordered_map.h", - "utils/fmt/vector.h", ] [[fields]] -name = "tensor_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" - -[[fields]] -name = "tensor_gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "tensor_optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" +name = "backing_for_training_tensor_map" +type = "std::unordered_map<::FlexFlow::training_tensor_guid_t, ::FlexFlow::GenericTensorAccessorW>" diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index addac74633..5484adef75 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -1,70 +1,50 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/local_args_backing.h" -#include "local-execution/local_tensor_backing.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/task_registry.h" +#include "local-execution/local_training_backing.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" -#include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/training_computation_graph.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" +#include 
"utils/containers/generate_map.h" +#include "utils/units/milliseconds_t.h" namespace FlexFlow { -struct LocalTrainingBacking { - LocalTrainingBacking(Allocator &, - AllocatedTensors const &, - GradientTensorSource &, - ComputationGraph const &, - RuntimeArgConfig const &); +LocalTrainingBacking make_local_training_backing_for_computation_graph( + Allocator &allocator, + std::unordered_map const + &preallocated_tensors, + TrainingComputationGraph const &training_computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs); + +std::optional + create_per_device_op_state(LocalTaskRegistry const &, + LocalTensorBacking const &, + RuntimeArgConfig const &, + Allocator &, + TrainingLayerPlusContext const &); + +std::optional execute_forward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + Allocator &); + +std::optional execute_backward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + Allocator &); + +void compute_loss(LocalTrainingBacking const &, LossAttrs const &, Allocator &); - LocalTrainingBacking(Allocator &, - AllocatedTensors const &, - GradientTensorSource &, - OptimizerTensorSource &, - ComputationGraph const &, - RuntimeArgConfig const &, - OptimizerAttrs const &); - -public: - ComputationGraph computation_graph; - TaskRegistry task_registry; - LocalTensorBacking local_tensor_backing; - LocalArgsBacking local_args_backing; -}; - -LocalArgsBacking initialize_args_backing(TaskRegistry const &, - ComputationGraph const &, - RuntimeArgConfig const &, - LocalTensorBacking const &, - Allocator &); - -std::optional call_task_impl(TaskRegistry const &, - task_id_t const &task_id, - TaskArgumentAccessor const &acc); - -std::optional execute_forward(LocalTrainingBacking const &, - layer_guid_t const &, - Allocator &); -std::optional execute_backward(LocalTrainingBacking const &, - layer_guid_t const &, - Allocator &); -void compute_loss(LocalTrainingBacking const &, - LossAttrs const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, - Allocator &); void execute_update(LocalTrainingBacking const &, layer_guid_t const &, OptimizerAttrs const &, Allocator &); -TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, - LocalArgsBacking const &, - TaskInvocation const &, - Allocator &); - } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_training_backing.struct.toml b/lib/local-execution/include/local-execution/local_training_backing.struct.toml new file mode 100644 index 0000000000..7da8c3bed6 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_training_backing.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "LocalTrainingBacking" +features = [] + +includes = [ + "task-spec/training_computation_graph.dtg.h", + "local-execution/local_task_registry.h", + "local-execution/local_tensor_backing.h", + "local-execution/local_args_backing.h", +] + +[[fields]] +name = "training_computation_graph" +type = "::FlexFlow::TrainingComputationGraph" + +[[fields]] +name = "local_task_registry" +type = "::FlexFlow::LocalTaskRegistry" + +[[fields]] +name = "local_tensor_backing" +type = "::FlexFlow::LocalTensorBacking" + +[[fields]] +name = "local_args_backing" +type = "::FlexFlow::LocalArgsBacking" diff --git a/lib/local-execution/include/local-execution/model_training_instance.h 
b/lib/local-execution/include/local-execution/model_training_instance.h index 6f8f4b1543..bfd279fde5 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -4,31 +4,24 @@ #include "local-execution/local_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - struct ModelTrainingInstance { ModelTrainingInstance(Allocator const &, LocalTrainingBacking const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); Allocator allocator; LocalTrainingBacking training_backing; - tensor_guid_t logit_tensor; - loss_tensor_t label_tensor; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; public: - PerLayerElapsedTime forward(); - PerLayerElapsedTime backward(); + std::unordered_map> forward(); + std::unordered_map> backward(); void update(); GenericTensorAccessorR get_loss_tensor_accessor() const; }; diff --git a/lib/local-execution/include/local-execution/operator_task_set.h b/lib/local-execution/include/local-execution/operator_task_set.h new file mode 100644 index 0000000000..bbe9da5d7f --- /dev/null +++ b/lib/local-execution/include/local-execution/operator_task_set.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H + +#include "local-execution/operator_task_set.dtg.h" +#include "op-attrs/computation_graph_op_attrs.dtg.h" +#include "task-spec/op_task_type.dtg.h" +#include "utils/bidict/bidict.h" + +namespace FlexFlow { + +bidict + get_map_from_task_type_to_task(OperatorTaskSet const &); +std::unordered_set + get_all_tasks_in_task_set(OperatorTaskSet const &); + +registered_task_t get_task_for_task_type(OperatorTaskSet const &op_task_set, + OpTaskType task_type); + +OperatorTaskSet + get_task_set_for_operator(ComputationGraphOpAttrs const &op_attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/operator_task_set.struct.toml b/lib/local-execution/include/local-execution/operator_task_set.struct.toml new file mode 100644 index 0000000000..dda2a1478d --- /dev/null +++ b/lib/local-execution/include/local-execution/operator_task_set.struct.toml @@ -0,0 +1,24 @@ +namespace = "FlexFlow" +name = "OperatorTaskSet" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "local-execution/registered_task_t.dtg.h" +] + +[[fields]] +name = "init_task" +type = "::FlexFlow::registered_task_t" + +[[fields]] +name = "fwd_task" +type = "::FlexFlow::registered_task_t" + +[[fields]] +name = "bwd_task" +type = "::FlexFlow::registered_task_t" diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h deleted file mode 100644 index b2b3d94ba5..0000000000 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H - -#include "task-spec/optimizer_tensor_t.dtg.h" - -namespace FlexFlow { - -struct OptimizerTensorSource { -public: - OptimizerTensorSource(); - - optimizer_tensor_t 
new_optimizer_tensor(); - - void reset(); - -private: - static size_t next_available_optimizer_tensor_id; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/registered_task.h b/lib/local-execution/include/local-execution/registered_task.h new file mode 100644 index 0000000000..d6e8a87b18 --- /dev/null +++ b/lib/local-execution/include/local-execution/registered_task.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_REGISTERED_TASK_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_REGISTERED_TASK_H + +#include "local-execution/registered_task_t.dtg.h" + +namespace FlexFlow { + +registered_task_t make_noop_registered_task(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/registered_task_t.variant.toml b/lib/local-execution/include/local-execution/registered_task_t.variant.toml new file mode 100644 index 0000000000..d4bab60ec9 --- /dev/null +++ b/lib/local-execution/include/local-execution/registered_task_t.variant.toml @@ -0,0 +1,27 @@ +namespace = "FlexFlow" +name = "registered_task_t" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", +] + +includes = [ + "task-spec/task_id_t.dtg.h", + "", +] + +src_includes = [ + "utils/rapidcheck/monostate.h", + "utils/fmt/monostate.h", +] + +[[values]] +type = "::FlexFlow::task_id_t" +key = "real_task" + +[[values]] +type = "std::monostate" +key = "noop_task" diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h deleted file mode 100644 index eb3e0859d0..0000000000 --- a/lib/local-execution/include/local-execution/task_registry.h +++ /dev/null @@ -1,21 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H - -#include "local-execution/task_registry.dtg.h" -#include "op-attrs/computation_graph_op_attrs.h" -#include "pcg/computation_graph.dtg.h" -#include "task-spec/op_task_type.dtg.h" - -namespace FlexFlow { - -TaskRegistry construct_task_registry( - std::unordered_map const &); - -bool registry_contains_task_for_layer(TaskRegistry const &, - layer_guid_t const &, - OpTaskType const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml deleted file mode 100644 index f5daa62090..0000000000 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ /dev/null @@ -1,35 +0,0 @@ -namespace = "FlexFlow" -name = "TaskRegistry" -features = [ - "eq", - "fmt", - "hash" -] - -includes = [ - "task-spec/task_signature_impl.dtg.h", - "task-spec/task_id_t.dtg.h", - "pcg/layer_guid_t.dtg.h", -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/fmt/optional.h", -] - -[[fields]] -name = "init_task_ids" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::task_id_t>>" - -[[fields]] -name = "forward_task_ids" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::task_id_t>>" - -[[fields]] -name = "backward_task_ids" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::task_id_t>>" - -[[fields]] -name = "task_mapping" -type = "std::unordered_map<::FlexFlow::task_id_t, ::FlexFlow::TaskSignatureAndImpl>" diff --git a/lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml 
b/lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml new file mode 100644 index 0000000000..434988fa21 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "TensorSlotBacking" +features = [ + "eq", + "fmt", +] + +includes = [ + "kernels/accessor.h", + "", +] + +src_includes = [ + "utils/fmt/vector.h", +] + +[[values]] +type = "::FlexFlow::GenericTensorAccessorW" +key = "single" + +[[values]] +type = "std::vector<::FlexFlow::GenericTensorAccessorW>" +key = "variadic" diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index f697337c52..0b531f9b3d 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #include "kernels/allocation.h" +#include "utils/units/num_bytes_t.h" namespace FlexFlow { @@ -16,7 +17,7 @@ struct TrackedAllocator : public IAllocator { DeviceType get_allocation_device_type() const override; - size_t get_current_mem_usage(); + num_bytes_t get_current_mem_usage() const; private: size_t current_mem_usage = 0; diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.h b/lib/local-execution/include/local-execution/unallocated_tensors.h deleted file mode 100644 index 63ead67589..0000000000 --- a/lib/local-execution/include/local-execution/unallocated_tensors.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H -#define _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H - -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.dtg.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "pcg/tensor_attrs.dtg.h" - -namespace FlexFlow { - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &); - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &, - OptimizerTensorSource &, - OptimizerAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml deleted file mode 100644 index e86cc2a532..0000000000 --- a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "UnallocatedTensors" -features = [ - "eq", - "fmt", - "hash", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "op-attrs/tensor_shape.dtg.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_shapes" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/src/allocated_tensors.cc 
b/lib/local-execution/src/allocated_tensors.cc deleted file mode 100644 index ffaeaf285f..0000000000 --- a/lib/local-execution/src/allocated_tensors.cc +++ /dev/null @@ -1,145 +0,0 @@ -#include "local-execution/allocated_tensors.h" -#include "pcg/optimizer_attrs.h" -#include "utils/containers/keys.h" -#include "utils/containers/set_union.h" - -namespace FlexFlow { - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &tensor_type, - std::unordered_map const - &allocated_tensor_backings, - ArrayShape const &expected_shape) { - if (allocated_tensor_backings.count(tensor_type)) { - GenericTensorAccessorW tensor_backing = - allocated_tensor_backings.at(tensor_type); - if (expected_shape == tensor_backing.shape) { - return true; - } - } - return false; -}; - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - - std::unordered_set all_tensor_guids = transform( - keys(filter_keys( - allocated_tensors.tensor_type_backings, - [&](TensorTypeVariant const &k) { return k.has(); })), - [&](TensorTypeVariant const &t) { return t.get(); }); - - for (tensor_guid_t const &tensor_guid : all_tensor_guids) { - if (tensor_attrs.count(tensor_guid)) { - if (!is_allocated_tensor_backing_valid( - TensorTypeVariant{tensor_guid}, - allocated_tensors.tensor_type_backings, - array_shape_from_tensor_shape( - tensor_attrs.at(tensor_guid).shape))) { - return false; - } - } else { - return false; - } - } - return true; -} - -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling gradient tensors - - for (std::pair const &tensor_to_grad : - allocated_tensors.gradient_mapping) { - if (tensor_attrs.count(tensor_to_grad.first)) { - if (tensor_attrs.at(tensor_to_grad.first).create_grad == CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( - tensor_attrs.at(tensor_to_grad.first).shape); - TensorTypeVariant gradient_tensor = - TensorTypeVariant{tensor_to_grad.second}; - if (is_allocated_tensor_backing_valid( - gradient_tensor, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(gradient_tensor); - } else { - return false; - } - } else { - return false; - } - } - - for (TensorTypeVariant const &tensor_type : - keys(allocated_tensors.tensor_type_backings)) { - if (tensor_type.has()) { - if (!tensors_in_mappings.count(tensor_type)) { - return false; - } - } - } - return true; -} - -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling optimizer tensors - - for (std::pair> const - &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { - if (tensor_attrs.count(tensor_to_optimizers.first)) { - if (tensor_attrs.at(tensor_to_optimizers.first).create_grad == - CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( - tensor_attrs.at(tensor_to_optimizers.first).shape); - for (optimizer_tensor_t const &optimizer_tensor : - tensor_to_optimizers.second) { - if (is_allocated_tensor_backing_valid( - TensorTypeVariant{optimizer_tensor}, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); - } else { - 
return false;
-        }
-      }
-    }
-  }
-
-  for (TensorTypeVariant const &tensor_type :
-       keys(allocated_tensors.tensor_type_backings)) {
-    if (tensor_type.has<optimizer_tensor_t>()) {
-      if (!tensors_in_mappings.count(tensor_type)) {
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-bool are_allocated_tensors_valid(
-    AllocatedTensors const &allocated_tensors,
-    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs) {
-  return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) &&
-         are_allocated_gradient_tensors_valid(allocated_tensors,
-                                              tensor_attrs) &&
-         are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs);
-}
-
-AllocatedTensors make_empty_allocated_tensors() {
-  return AllocatedTensors{{}, {}, {}};
-}
-
-} // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/local_args_backing.cc b/lib/local-execution/src/local-execution/local_args_backing.cc
new file mode 100644
index 0000000000..a672b9d164
--- /dev/null
+++ b/lib/local-execution/src/local-execution/local_args_backing.cc
@@ -0,0 +1,62 @@
+#include "local-execution/local_args_backing.h"
+#include "local-execution/local_task_registry.h"
+#include "local-execution/local_tensor_backing.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "task-spec/op_task_to_task_invocation.h"
+#include "task-spec/task_signature_impl.h"
+#include "task-spec/training_computation_graph.h"
+#include "task-spec/training_layer_plus_context.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/generate_map.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/try_at.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+std::optional<DeviceSpecificDeviceStates> get_per_device_op_state_if_exists(
+    LocalArgsBacking const &local_args_backing,
+    layer_guid_t const &layer_guid) {
+
+  return local_args_backing.per_device_op_states.at(layer_guid);
+}
+
+std::unordered_map<slot_id_t, ConcreteArgSpec>
+    construct_arg_slots_backing(TaskBinding const &binding,
+                                RuntimeArgConfig const &runtime_arg_config) {
+  return map_values(
+      binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
+        return arg_binding.template visit<ConcreteArgSpec>(
+            overload{[&](RuntimeArgRefSpec const &s) {
+                       return lower_to_concrete_arg_spec(s, runtime_arg_config);
+                     },
+                     [](ConcreteArgSpec const &s) { return s; }});
+      });
+}
+
+TaskArgumentAccessor
+    get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing,
+                          RuntimeArgConfig const &runtime_arg_config,
+                          TaskInvocation const &invocation,
+                          Allocator &allocator) {
+  std::unordered_map<tensor_sub_slot_id_t, TensorSlotBacking>
+      tensor_slots_backing = construct_tensor_slots_backing_for_binding(
+          local_tensor_backing, invocation.binding);
+  std::unordered_map<slot_id_t, ConcreteArgSpec> arg_slots_backing =
+      construct_arg_slots_backing(invocation.binding, runtime_arg_config);
+  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
+      allocator, tensor_slots_backing, arg_slots_backing);
+}
+
+LocalArgsBacking make_local_args_backing_for_computation_graph(
+    RuntimeArgConfig const &runtime_arg_config,
+    std::unordered_map<layer_guid_t,
+                       std::optional<DeviceSpecificDeviceStates>> const
+        &per_device_op_states) {
+  return LocalArgsBacking{
+      runtime_arg_config,
+      per_device_op_states,
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/local_cost_estimator.cc b/lib/local-execution/src/local-execution/local_cost_estimator.cc
new file mode 100644
index 0000000000..6517dbfdbc
--- /dev/null
+++ b/lib/local-execution/src/local-execution/local_cost_estimator.cc
@@ -0,0 +1,166 @@
+#include "local-execution/local_cost_estimator.h"
+#include "kernels/create_local_allocator_for_device_type.h"
+#include "kernels/device.h"
+#include "kernels/local_cpu_allocator.h"
"kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "local-execution/local_training_backing.h" +#include "local-execution/tracked_allocator.h" +#include "op-attrs/computation_graph_op_attrs.h" +#include "op-attrs/pcg_operator_attrs.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph/layer_added_result.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/parallel_tensor_attrs.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/concat_vectors.h" +#include "utils/containers/get_only.h" +#include "utils/containers/sum.h" +#include "utils/containers/transform.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) + : runtime_arg_config(config) {} + +static TrainingComputationGraph + create_computation_graph_for_local_cost_estimation( + PCGOperatorAttrs const &op, + OptimizerAttrs const &optimizer_attrs, + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs) { + ComputationGraph computation_graph = make_empty_computation_graph(); + + std::vector input_tensors; + for (ParallelTensorShape const &input : inputs) { + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}}, + std::nullopt}, + {}, + {}); + input_tensors.push_back(get_only(inputs_layer.outputs)); + } + + std::vector weight_tensors; + for (ParallelTensorShape const &weight : weights) { + LayerAddedResult weights_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + get_piece_shape(weight), + InitializerAttrs{ZeroInitializerAttrs{}}}}, + std::nullopt}, + {}, + {}); + weight_tensors.push_back(get_only(weights_layer.outputs)); + } + + // create operator layer + LayerAddedResult operator_layer = add_layer( + computation_graph, + LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, + input_tensors, + weight_tensors); + + ForwardTensorSource forward_tensor_source; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + TrainingComputationGraph training_cg = generate_training_computation_graph( + /*computation_graph=*/computation_graph, + /*optimizer_attrs=*/optimizer_attrs, + /*logit_tensor=*/operator_layer.outputs.at(0), + /*forward_tensor_source=*/forward_tensor_source, + /*gradient_tensor_source=*/gradient_tensor_source, + /*optimizer_tensor_source=*/optimizer_tensor_source, + /*loss_tensor_source=*/loss_tensor_source); + + return training_cg; +} + +OpCostMetrics LocalCostEstimator::estimate_cost( + OpCostEstimateKey const &op_cost_estimate_key) const { + + PCGOperatorAttrs op = op_cost_estimate_key.op_attrs; + std::vector inputs = op_cost_estimate_key.input_shapes; + std::vector weights = op_cost_estimate_key.weight_shapes; + std::vector outputs = op_cost_estimate_key.output_shapes; + MachineView mv = op_cost_estimate_key.machine_view; + + if (is_parallel_op(op) || op.has() || op.has() || + op.has()) { + return OpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + /*memory=*/0_bytes, + }; + } + + TrainingComputationGraph training_cg = + create_computation_graph_for_local_cost_estimation( + /*op=*/op, + /*optimizer_attrs=*/op_cost_estimate_key.optimizer_attrs, + 
/*inputs=*/inputs, + /*weights=*/weights, + /*outputs=*/outputs); + + // allocate memory + std::shared_ptr tracked_allocator_ptr = + std::make_shared(create_local_allocator_for_device_type( + runtime_arg_config.kernel_device_type)); + Allocator allocator = Allocator(tracked_allocator_ptr); + + LocalTrainingBacking local_backing = + make_local_training_backing_for_computation_graph( + /*allocator=*/allocator, + /*preallocated_tensors=*/{}, + /*training_computation_graph=*/training_cg, + /*runtime_arg_config=*/this->runtime_arg_config, + /*optimizer_attrs=*/op_cost_estimate_key.optimizer_attrs); + + // execute layer + layer_guid_t operator_layer_guid = + get_layer_by_name(training_cg.computation_graph, "operator"); + + milliseconds_t fwd = execute_forward(local_backing.local_task_registry, + local_backing.local_tensor_backing, + local_backing.local_args_backing, + get_training_layer_plus_context( + training_cg, operator_layer_guid), + allocator) + .value(); + milliseconds_t bwd = execute_backward(local_backing.local_task_registry, + local_backing.local_tensor_backing, + local_backing.local_args_backing, + get_training_layer_plus_context( + training_cg, operator_layer_guid), + allocator) + .value(); + + return OpCostMetrics{ + /*forward_runtime=*/fwd, + /*backward_runtime=*/bwd, + /*memory=*/tracked_allocator_ptr->get_current_mem_usage(), + }; +} + +milliseconds_t LocalCostEstimator::estimate_cost( + TensorSetMovement const &tensor_set_movement) const { + // TODO: model communication cost analytically + // https://github.com/flexflow/FlexFlow/issues/1414 + + NOT_IMPLEMENTED(); +} + +CostEstimator + get_local_cost_estimator(RuntimeArgConfig const &runtime_arg_config) { + return CostEstimator::create(runtime_arg_config); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_task_registry.cc b/lib/local-execution/src/local-execution/local_task_registry.cc new file mode 100644 index 0000000000..d482736a5b --- /dev/null +++ b/lib/local-execution/src/local-execution/local_task_registry.cc @@ -0,0 +1,64 @@ +#include "local-execution/local_task_registry.h" +#include "local-execution/operator_task_set.h" +#include "local-execution/registered_task.h" +#include "pcg/computation_graph.h" +#include "task-spec/task_signature_impl.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/filtrans.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/map_values.h" +#include "utils/containers/try_at.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +LocalTaskRegistry construct_local_task_registry_for_layers( + std::unordered_map const &layer_attrs_mapping) { + + std::unordered_map task_sets = + map_values(layer_attrs_mapping, [](LayerAttrs const &layer_attrs) { + return get_task_set_for_operator(layer_attrs.op_attrs); + }); + + std::unordered_set all_tasks = + flatmap(unordered_set_of(values(task_sets)), get_all_tasks_in_task_set); + + std::unordered_set all_real_tasks = + filtrans(all_tasks, [](registered_task_t const &t) { + return t.try_require_real_task(); + }); + + std::unordered_map task_mapping = + generate_map(all_real_tasks, get_task_signature_and_impl_for_task_id); + + return LocalTaskRegistry{ + /*task_sets=*/task_sets, + /*task_mapping=*/task_mapping, + }; +} + +std::optional + try_get_registered_task(LocalTaskRegistry const &task_registry, + layer_guid_t const &layer_guid, + OpTaskType const &op_task_type) { + if (!contains_key(task_registry.task_sets, layer_guid)) { + 
return std::nullopt; + } + + return get_task_for_task_type(task_registry.task_sets.at(layer_guid), + op_task_type); +} + +std::optional + call_task_impl(LocalTaskRegistry const &task_registry, + task_id_t const &task_id, + TaskArgumentAccessor const &acc) { + TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); + auto fn = + task_sig_impl.impl_function.get().function_ptr; + return transform( + fn(acc), [](float running_time) { return milliseconds_t{running_time}; }); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_tensor_backing.cc b/lib/local-execution/src/local-execution/local_tensor_backing.cc new file mode 100644 index 0000000000..be8e44736c --- /dev/null +++ b/lib/local-execution/src/local-execution/local_tensor_backing.cc @@ -0,0 +1,74 @@ +#include "local-execution/local_tensor_backing.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "task-spec/slot_grad_id.dtg.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/is_submapeq_of.h" +#include "utils/containers/is_subseteq_of.h" +#include "utils/containers/keys.h" +#include "utils/containers/map_values.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/set_of.h" +#include "utils/overload.h" + +namespace FlexFlow { + +LocalTensorBacking construct_local_tensor_backing( + std::unordered_map const + &training_tensor_shapes, + std::unordered_map const + &preallocated, + Allocator &allocator) { + + ASSERT(is_subseteq_of(keys(preallocated), keys(training_tensor_shapes))); + + std::unordered_set to_allocate = + set_minus(keys(training_tensor_shapes), keys(preallocated)); + + std::unordered_map allocated = + generate_map(to_allocate, [&](training_tensor_guid_t t) { + TensorShape shape = training_tensor_shapes.at(t); + return allocator.allocate_tensor(shape); + }); + + std::unordered_map + backing_for_training_tensor_map = + merge_disjoint_maps(allocated, preallocated); + + ASSERT(is_submapeq_of(preallocated, backing_for_training_tensor_map)); + + ASSERT(keys(backing_for_training_tensor_map) == keys(training_tensor_shapes), + backing_for_training_tensor_map.size(), + training_tensor_shapes.size(), + keys(preallocated)); + + return LocalTensorBacking{ + backing_for_training_tensor_map, + }; +} + +GenericTensorAccessorW get_accessor_for_training_tensor( + LocalTensorBacking const &local_tensor_backing, + training_tensor_guid_t training_tensor) { + return local_tensor_backing.backing_for_training_tensor_map.at( + training_tensor); +} + +std::unordered_map + construct_tensor_slots_backing_for_binding( + LocalTensorBacking const &local_tensor_backing, + TaskBinding const &binding) { + + return map_values( + binding.get_tensor_bindings(), [&](training_tensor_guid_t t) { + return TensorSlotBacking{ + get_accessor_for_training_tensor(local_tensor_backing, t), + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_training_backing.cc b/lib/local-execution/src/local-execution/local_training_backing.cc new file mode 100644 index 0000000000..9c67d3acd3 --- /dev/null +++ b/lib/local-execution/src/local-execution/local_training_backing.cc @@ -0,0 +1,221 @@ +#include "local-execution/local_training_backing.h" +#include "local-execution/local_args_backing.h" +#include "pcg/computation_graph.h" 
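+// Usage sketch (illustrative only; `allocator`, `training_cg`, `loss_attrs`,
+// `runtime_arg_config`, and `optimizer_attrs` are placeholder names, not part
+// of this file): the functions below are typically driven as
+//
+//   LocalTrainingBacking backing =
+//       make_local_training_backing_for_computation_graph(
+//           allocator, /*preallocated_tensors=*/{}, training_cg,
+//           runtime_arg_config, optimizer_attrs);
+//   compute_loss(backing, loss_attrs, allocator);
+//
+// with execute_forward/execute_backward invoked per layer by the
+// ModelTrainingInstance wrapper.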
+#include "pcg/optimizer_attrs.h" +#include "task-spec/loss_functions.h" +#include "task-spec/op_task_to_task_invocation.h" +#include "task-spec/optimizer.h" +#include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/contains.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/get_only.h" +#include "utils/containers/is_subseteq_of.h" +#include "utils/containers/keys.h" +#include "utils/containers/values.h" +#include "utils/exception.h" + +namespace FlexFlow { + +LocalTrainingBacking make_local_training_backing_for_computation_graph( + Allocator &allocator, + std::unordered_map const + &preallocated, + TrainingComputationGraph const &training_computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs) { + + ASSERT(is_subseteq_of( + keys(preallocated), + keys(get_all_training_tensor_shapes(training_computation_graph)))); + + LocalTaskRegistry local_task_registry = + construct_local_task_registry_for_layers(get_layer_attrs_mapping( + training_computation_graph.computation_graph)); + + LocalTensorBacking local_tensor_backing = construct_local_tensor_backing( + get_all_training_tensor_shapes(training_computation_graph), + preallocated, + allocator); + + std::unordered_map> + per_device_op_states = generate_map( + topological_ordering(training_computation_graph.computation_graph), + [&](layer_guid_t const &layer_guid) { + return create_per_device_op_state( + local_task_registry, + local_tensor_backing, + runtime_arg_config, + allocator, + get_training_layer_plus_context(training_computation_graph, + layer_guid)); + }); + + LocalArgsBacking local_args_backing = + make_local_args_backing_for_computation_graph(runtime_arg_config, + per_device_op_states); + + return LocalTrainingBacking{ + /*computation_graph=*/training_computation_graph, + /*local_task_registry=*/local_task_registry, + /*local_tensor_backing=*/local_tensor_backing, + /*local_args_backing=*/local_args_backing, + }; +} + +std::optional + create_per_device_op_state(LocalTaskRegistry const &local_task_registry, + LocalTensorBacking const &tensor_backing, + RuntimeArgConfig const &runtime_arg_config, + Allocator &allocator, + TrainingLayerPlusContext const &training_layer) { + std::optional maybe_registered_task = try_get_registered_task( + local_task_registry, training_layer.layer_guid, OpTaskType::INIT); + + ASSERT(maybe_registered_task.has_value()); + + registered_task_t registered_task = maybe_registered_task.value(); + if (registered_task.is_noop_task()) { + return std::nullopt; + } + + TaskInvocation invocation = lower_to_task_invocation( + /*op_task_invocation=*/get_init_op_task_invocation( + training_layer.layer_attrs.op_attrs), + /*training_layer=*/training_layer, + /*device_specific_device_states=*/std::nullopt); + + TaskArgumentAccessor accessor = get_task_arg_accessor( + tensor_backing, runtime_arg_config, invocation, allocator); + TaskSignatureAndImpl task_sig_impl = + local_task_registry.task_mapping.at(invocation.task_id); + auto fn = + task_sig_impl.impl_function.get().function_ptr; + DeviceSpecificDeviceStates device_state = fn(accessor); + return device_state; +} + +std::optional + execute_forward(LocalTaskRegistry const &local_task_registry, + LocalTensorBacking const &local_tensor_backing, + LocalArgsBacking const &local_args_backing, + TrainingLayerPlusContext const &training_layer, + Allocator &allocator) { + + std::optional 
maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::FWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return std::nullopt;
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+
+  TaskInvocation invocation = lower_to_task_invocation(
+      /*op_task_invocation=*/get_forward_op_task_invocation(
+          training_layer.layer_attrs.op_attrs),
+      /*training_layer=*/training_layer,
+      /*device_specific_device_states=*/device_state);
+
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            allocator);
+  return call_task_impl(local_task_registry, invocation.task_id, accessor);
+}
+
+void compute_loss(LocalTrainingBacking const &local_training_backing,
+                  LossAttrs const &loss_attrs,
+                  Allocator &allocator) {
+
+  TrainingComputationGraph training_cg =
+      local_training_backing.training_computation_graph;
+  tensor_guid_t logit_tensor = training_cg.logit_tensor;
+  loss_tensor_guid_t label_tensor = training_cg.label_tensor;
+
+  TaskInvocation loss_invocation = backward(
+      loss_attrs,
+      get_forward_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
+      get_gradient_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
+      label_tensor);
+  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor = get_task_arg_accessor(
+      local_training_backing.local_tensor_backing,
+      local_training_backing.local_args_backing.runtime_arg_config,
+      loss_invocation,
+      allocator);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+}
+
+std::optional<milliseconds_t>
+    execute_backward(LocalTaskRegistry const &local_task_registry,
+                     LocalTensorBacking const &local_tensor_backing,
+                     LocalArgsBacking const &local_args_backing,
+                     TrainingLayerPlusContext const &training_layer,
+                     Allocator &allocator) {
+
+  std::optional<registered_task_t> maybe_registered_task =
+      try_get_registered_task(
+          local_task_registry, training_layer.layer_guid, OpTaskType::BWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return std::nullopt;
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+  TaskInvocation invocation = lower_to_task_invocation(
+      get_backward_op_task_invocation(training_layer.layer_attrs.op_attrs),
+      training_layer,
+      device_state);
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            allocator);
+  return call_task_impl(local_task_registry, invocation.task_id, accessor);
+}
+
+void execute_update(LocalTrainingBacking const &local_training_backing,
+                    layer_guid_t const &layer_guid,
+                    OptimizerAttrs const &optimizer_attrs,
+                    Allocator &allocator) {
+  TrainingLayerPlusContext training_layer = get_training_layer_plus_context(
+      local_training_backing.training_computation_graph, layer_guid);
+
+  if (training_layer.layer_attrs.op_attrs.has<WeightAttrs>()) {
+    TrainingTensorGroupWithAttrs weight_tensor_group =
+        get_only(training_layer.output_tensor_groups);
+
+    TaskInvocation invocation =
+        get_update_invocation(optimizer_attrs,
+
weight_tensor_group.forward_tensor, + weight_tensor_group.gradient_tensor, + weight_tensor_group.optimizer_tensors); + + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 + // assert(is_invocation_valid(get_update_signature(attrs), invocation)); + + TaskArgumentAccessor accessor = get_task_arg_accessor( + local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing.runtime_arg_config, + invocation, + allocator); + TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); + update_impl_fn.get().function_ptr(accessor); + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/model_training_instance.cc b/lib/local-execution/src/local-execution/model_training_instance.cc new file mode 100644 index 0000000000..be2791a365 --- /dev/null +++ b/lib/local-execution/src/local-execution/model_training_instance.cc @@ -0,0 +1,85 @@ +#include "local-execution/model_training_instance.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/reversed.h" + +namespace FlexFlow { + +ModelTrainingInstance::ModelTrainingInstance( + Allocator const &allocator, + LocalTrainingBacking const &local_training_backing, + LossAttrs const &loss_attrs, + OptimizerAttrs const &optimizer_attrs) + : allocator(allocator), training_backing(local_training_backing), + loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs) {} + +std::unordered_map> + ModelTrainingInstance::forward() { + + std::unordered_map> + per_layer_elapsed_time; + + for (layer_guid_t const &layer_guid : + topological_ordering(this->training_backing.training_computation_graph + .computation_graph)) { + std::optional elapsed_time = execute_forward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->allocator); + + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); + } + + return per_layer_elapsed_time; +} + +std::unordered_map> + ModelTrainingInstance::backward() { + compute_loss(this->training_backing, this->loss_attrs, this->allocator); + + std::unordered_map> + per_layer_elapsed_time; + for (layer_guid_t const &layer_guid : reversed(topological_ordering( + this->training_backing.training_computation_graph + .computation_graph))) { + std::optional elapsed_time = execute_backward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->allocator); + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); + } + return per_layer_elapsed_time; +} + +void ModelTrainingInstance::update() { + for (layer_guid_t const &layer_guid : + topological_ordering(this->training_backing.training_computation_graph + .computation_graph)) { + execute_update(this->training_backing, + layer_guid, + this->optimizer_attrs, + this->allocator); + } + this->optimizer_attrs = + get_optimizer_attrs_for_next_iter(this->optimizer_attrs); +} + +GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + gradient_tensor_guid_t loss_tensor = get_gradient_tensor_guid_for_tensor_guid( + this->training_backing.training_computation_graph, + this->training_backing.training_computation_graph.logit_tensor); 
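+  // Note: the loss backward task (see compute_loss in
+  // local_training_backing.cc) writes the loss gradient into the gradient
+  // tensor of the logit tensor, so the accessor returned below is a
+  // read-only view of that gradient tensor's backing.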
+ GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing + .backing_for_training_tensor_map.at( + training_tensor_guid_t{loss_tensor}); + return read_only_accessor_from_write_accessor(loss_tensor_backing); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/operator_task_set.cc b/lib/local-execution/src/local-execution/operator_task_set.cc new file mode 100644 index 0000000000..8dbc8791c6 --- /dev/null +++ b/lib/local-execution/src/local-execution/operator_task_set.cc @@ -0,0 +1,71 @@ +#include "local-execution/operator_task_set.h" +#include "local-execution/registered_task.h" +#include "task-spec/task_signature_impl.h" +#include "utils/bidict/algorithms/right_entries.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +bidict + get_map_from_task_type_to_task(OperatorTaskSet const &op_task_set) { + return { + {OpTaskType::INIT, op_task_set.init_task}, + {OpTaskType::FWD, op_task_set.fwd_task}, + {OpTaskType::BWD, op_task_set.bwd_task}, + }; +} + +std::unordered_set + get_all_tasks_in_task_set(OperatorTaskSet const &op_task_set) { + return right_entries(get_map_from_task_type_to_task(op_task_set)); +} + +registered_task_t get_task_for_task_type(OperatorTaskSet const &op_task_set, + OpTaskType task_type) { + return get_map_from_task_type_to_task(op_task_set).at_l(task_type); +} + +OperatorTaskSet + get_task_set_for_operator(ComputationGraphOpAttrs const &attrs) { + registered_task_t init_task = make_noop_registered_task(); + registered_task_t fwd_task = make_noop_registered_task(); + registered_task_t bwd_task = make_noop_registered_task(); + + std::vector task_ids = get_task_ids(attrs); + + for (task_id_t const &task_id : task_ids) { + TaskSignatureAndImpl task_signature_and_impl = + get_task_signature_and_impl_for_task_id(task_id); + + TaskImplFunction task_impl_function = task_signature_and_impl.impl_function; + OpTaskSignature task_signature = task_signature_and_impl.task_signature; + + switch (task_signature.type) { + case OpTaskType::INIT: + ASSERT(is_invocation_valid(task_signature, + get_init_op_task_invocation(attrs))); + init_task = registered_task_t{task_id}; + break; + case OpTaskType::FWD: + ASSERT(is_invocation_valid(task_signature, + get_forward_op_task_invocation(attrs))); + fwd_task = registered_task_t{task_id}; + break; + case OpTaskType::BWD: + ASSERT(is_invocation_valid(task_signature, + get_backward_op_task_invocation(attrs))); + bwd_task = registered_task_t{task_id}; + break; + default: + PANIC("Unhandled OpTaskType", fmt::to_string(task_signature.type)); + } + } + + return OperatorTaskSet{ + /*init_task=*/init_task, + /*fwd_task=*/fwd_task, + /*bwd_task=*/bwd_task, + }; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/registered_task.cc b/lib/local-execution/src/local-execution/registered_task.cc new file mode 100644 index 0000000000..84b116273a --- /dev/null +++ b/lib/local-execution/src/local-execution/registered_task.cc @@ -0,0 +1,9 @@ +#include "local-execution/registered_task.h" + +namespace FlexFlow { + +registered_task_t make_noop_registered_task() { + return registered_task_t{std::monostate{}}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc deleted file mode 100644 index 4a342767b2..0000000000 --- a/lib/local-execution/src/local_args_backing.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include "local-execution/local_args_backing.h" -#include 
"op-attrs/parallel_tensor_shape.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/map_values.h" -#include "utils/overload.h" - -namespace FlexFlow { - -LocalArgsBacking make_args_backing_with_empty_device_states( - RuntimeArgConfig const &runtime_arg_config) { - return LocalArgsBacking{runtime_arg_config, {}}; -} - -LocalArgsBacking::LocalArgsBacking( - RuntimeArgConfig const &runtime_arg_config, - std::unordered_map const - &device_states) - : runtime_arg_config(runtime_arg_config), - per_device_op_states(device_states){}; - -std::optional get_per_device_op_state_if_exists( - LocalArgsBacking const &local_args_backing, - layer_guid_t const &layer_guid) { - if (contains_key(local_args_backing.per_device_op_states, layer_guid)) { - return local_args_backing.per_device_op_states.at(layer_guid); - } else { - return std::nullopt; - } -} - -ArgSlotsBacking - construct_arg_slots_backing(TaskBinding const &binding, - RuntimeArgConfig const &runtime_arg_config) { - return map_values( - binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return lower_to_concrete_arg_spec(s, runtime_arg_config); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc deleted file mode 100644 index 85f315c7d1..0000000000 --- a/lib/local-execution/src/local_cost_estimator.cc +++ /dev/null @@ -1,122 +0,0 @@ -#include "local-execution/local_cost_estimator.h" -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "local-execution/tracked_allocator.h" -#include "op-attrs/computation_graph_op_attrs.h" -#include "op-attrs/pcg_operator_attrs.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph/layer_added_result.dtg.h" -#include "pcg/machine_view.dtg.h" -#include "pcg/parallel_tensor_attrs.h" -#include "utils/containers/concat_vectors.h" -#include "utils/containers/get_only.h" -#include "utils/containers/sum.h" -#include "utils/containers/transform.h" -#include "utils/containers/values.h" - -namespace FlexFlow { - -LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) - : runtime_arg_config(config) {} - -static ComputationGraph create_computation_graph_for_local_cost_estimation( - PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { - ComputationGraph computation_graph = make_empty_computation_graph(); - - std::vector input_tensors; - for (ParallelTensorShape const &input : inputs) { - LayerAddedResult inputs_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}}, - std::nullopt}, - {}, - {}); - input_tensors.push_back(get_only(inputs_layer.outputs)); - } - - std::vector weight_tensors; - for (ParallelTensorAttrs const &weight : weights) { - LayerAddedResult weights_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - get_piece_shape(weight.shape), - InitializerAttrs{ZeroInitializerAttrs{}}}}, - std::nullopt}, - {}, - {}); - weight_tensors.push_back(get_only(weights_layer.outputs)); - } - - // create operator layer - LayerAddedResult operator_layer = add_layer( - computation_graph, - LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, - input_tensors, - weight_tensors); - - 
return computation_graph; -} - -CostDetails LocalCostEstimator::estimate_cost( - PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const { - - if (is_parallel_op(op) || op.has() || op.has() || - op.has()) { - return CostDetails{0, 0}; - } - - // construct computation graph - ComputationGraph computation_graph = - create_computation_graph_for_local_cost_estimation( - op, inputs, weights, outputs); - - // allocate memory - std::shared_ptr tracked_allocator_ptr = - std::make_shared(create_local_cuda_memory_allocator()); - Allocator allocator = Allocator(tracked_allocator_ptr); - - GradientTensorSource gradient_tensor_source; - - LocalTrainingBacking local_backing(allocator, - AllocatedTensors{{}, {}, {}}, - gradient_tensor_source, - computation_graph, - this->runtime_arg_config); - // execute layer - layer_guid_t operator_layer_guid = - get_layer_by_name(computation_graph, "operator"); - - float fwd = - execute_forward(local_backing, operator_layer_guid, allocator).value(); - float bwd = - execute_backward(local_backing, operator_layer_guid, allocator).value(); - - float total_execution_time = fwd + bwd; - - return CostDetails{total_execution_time, - tracked_allocator_ptr->get_current_mem_usage()}; -} - -float LocalCostEstimator::estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const { - // TODO: model communication cost analytically - // https://github.com/flexflow/FlexFlow/issues/1414 - // temporarily return 0 - - return 0.0; -} - -CostEstimator - get_local_cost_estimator(RuntimeArgConfig const &runtime_arg_config) { - return CostEstimator::create(runtime_arg_config); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 2e82378fdb..207305a8db 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -8,8 +8,9 @@ namespace FlexFlow { LocalTaskArgumentAccessor::LocalTaskArgumentAccessor( Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing) + std::unordered_map const + &tensor_slots_backing, + std::unordered_map const &arg_slots_backing) : allocator(allocator), tensor_slots_backing(tensor_slots_backing), arg_slots_backing(arg_slots_backing){}; @@ -20,9 +21,10 @@ ConcreteArgSpec const & GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( slot_id_t slot, Permissions priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto tensor_backing = std::get( - this->tensor_slots_backing.at(slot_tensor_type)); + tensor_sub_slot_id_t slot_tensor_type = + tensor_sub_slot_id_t{slot, tensor_type}; + GenericTensorAccessorW tensor_backing = + this->tensor_slots_backing.at(slot_tensor_type).require_single(); if (priv == Permissions::RO) { GenericTensorAccessorR readonly_tensor_backing = read_only_accessor_from_write_accessor(tensor_backing); @@ -30,15 +32,16 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); + PANIC(fmt::format("Unhandled privilege mode {}", priv)); } } VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, 
Permissions priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto variadic_tensor_backing = std::get>( - this->tensor_slots_backing.at(slot_tensor_type)); + tensor_sub_slot_id_t slot_tensor_type = + tensor_sub_slot_id_t{slot, tensor_type}; + std::vector variadic_tensor_backing = + this->tensor_slots_backing.at(slot_tensor_type).require_variadic(); if (priv == Permissions::RO) { std::vector readonly_variadic_tensor_backing = {}; for (GenericTensorAccessorW const &tensor_backing : @@ -50,7 +53,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( } else if (priv == Permissions::RW || priv == Permissions::WO) { return variadic_tensor_backing; } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); + PANIC(fmt::format("Unhandled privilege mode {}", priv)); } } diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc deleted file mode 100644 index 629117508f..0000000000 --- a/lib/local-execution/src/local_tensor_backing.cc +++ /dev/null @@ -1,95 +0,0 @@ -#include "local-execution/local_tensor_backing.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "task-spec/slot_grad_id.dtg.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/keys.h" -#include "utils/overload.h" - -namespace FlexFlow { - -GenericTensorAccessorW - get_tensor(LocalTensorBacking const &local_tensor_backing, - TensorTypeVariant const &tensor_type) { - return local_tensor_backing.tensor_backings.at(tensor_type); -} - -std::unordered_map> - merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated) { - std::unordered_map> - merged_maps = allocated; - for (std::pair> const - &unallocated_optimizer_tensors : unallocated) { - if (merged_maps.count(unallocated_optimizer_tensors.first)) { - for (optimizer_tensor_t const &optimizer_tensor : - unallocated_optimizer_tensors.second) { - merged_maps[unallocated_optimizer_tensors.first].push_back( - optimizer_tensor); - } - } else { - merged_maps.insert({unallocated_optimizer_tensors}); - } - } - return merged_maps; -} - -std::unordered_map - get_tensor_backings( - std::unordered_map const - &tensor_type_backings, - std::unordered_map const - &tensor_type_shapes, - Allocator &allocator) { - std::unordered_map - all_tensor_backings = tensor_type_backings; - - // allocate new tensors - for (std::pair const &tensor_type_shape : - tensor_type_shapes) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_type_shape.second); - all_tensor_backings.insert({tensor_type_shape.first, tensor_backing}); - } - - return all_tensor_backings; -} - -LocalTensorBacking construct_local_tensor_backing( - AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator &allocator) { - - std::unordered_map merged_gradient_maps = - allocated_tensors.gradient_mapping; - merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(), - unallocated_tensors.gradient_mapping.end()); - - return LocalTensorBacking{ - get_tensor_backings(allocated_tensors.tensor_type_backings, - unallocated_tensors.tensor_type_shapes, - allocator), - merged_gradient_maps, - merge_optimizer_mappings(allocated_tensors.optimizer_mapping, - unallocated_tensors.optimizer_mapping)}; -} - -TensorSlotsBacking construct_tensor_slots_backing( - 
LocalTensorBacking const &local_tensor_backing, - TaskBinding const &binding) { - TensorSlotsBacking mapping; - - for (std::pair const &tensor_binding : - binding.get_tensor_bindings()) { - mapping.insert({tensor_binding.first, - get_tensor(local_tensor_backing, tensor_binding.second)}); - } - - return mapping; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc deleted file mode 100644 index 3b1bb0fd2d..0000000000 --- a/lib/local-execution/src/local_training_backing.cc +++ /dev/null @@ -1,264 +0,0 @@ -#include "local-execution/local_training_backing.h" -#include "local-execution/loss_functions.h" -#include "local-execution/optimizer.h" -#include "local-execution/unallocated_tensors.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "task-spec/task_invocation.h" -#include "task-spec/task_signature_impl.h" -#include "utils/containers/contains.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/get_only.h" -#include "utils/containers/values.h" -#include "utils/exception.h" - -namespace FlexFlow { - -LocalTrainingBacking::LocalTrainingBacking( - Allocator &allocator, - AllocatedTensors const &allocated_tensors, - GradientTensorSource &gradient_tensor_source, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config) - : computation_graph(computation_graph), - task_registry( - construct_task_registry(get_layer_attrs_mapping(computation_graph))), - local_tensor_backing(construct_local_tensor_backing( - allocated_tensors, - generate_unallocated_tensors(allocated_tensors, - get_all_tensor_attrs(computation_graph), - gradient_tensor_source), - allocator)), - local_args_backing(initialize_args_backing(this->task_registry, - computation_graph, - runtime_arg_config, - this->local_tensor_backing, - allocator)){}; - -LocalTrainingBacking::LocalTrainingBacking( - Allocator &allocator, - AllocatedTensors const &allocated_tensors, - GradientTensorSource &gradient_tensor_source, - OptimizerTensorSource &optimizer_tensor_source, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, - OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - task_registry( - construct_task_registry(get_layer_attrs_mapping(computation_graph))), - local_tensor_backing(construct_local_tensor_backing( - allocated_tensors, - generate_unallocated_tensors_with_optimizer( - allocated_tensors, - get_all_tensor_attrs(computation_graph), - gradient_tensor_source, - optimizer_tensor_source, - optimizer_attrs), - allocator)), - local_args_backing(initialize_args_backing(this->task_registry, - computation_graph, - runtime_arg_config, - this->local_tensor_backing, - allocator)){}; -LocalArgsBacking - initialize_args_backing(TaskRegistry const &task_registry, - ComputationGraph const &cg, - RuntimeArgConfig const &runtime_arg_config, - LocalTensorBacking const &local_tensor_backing, - Allocator &allocator) { - std::unordered_map - per_device_op_states; - for (layer_guid_t const &node : topological_ordering(cg)) { - if (registry_contains_task_for_layer( - task_registry, node, OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs; - - TaskInvocation invocation = - lower_to_task_invocation(init(attrs), - node, - get_incoming_inputs(cg, node), - get_incoming_input_shapes(cg, node), - get_outgoing_tensors(cg, node), - 
get_incoming_weights(cg, node), - local_tensor_backing.tensor_gradient_mapping, - std::nullopt); - TaskArgumentAccessor accessor = get_task_arg_accessor( - local_tensor_backing, - make_args_backing_with_empty_device_states(runtime_arg_config), - invocation, - allocator); - TaskSignatureAndImpl task_sig_impl = - task_registry.task_mapping.at(invocation.task_id); - auto fn = task_sig_impl.impl_function.get() - .function_ptr; - DeviceSpecificDeviceStates device_state = fn(accessor); - per_device_op_states.insert({node, device_state}); - } - } - - return LocalArgsBacking{runtime_arg_config, per_device_op_states}; -} - -std::optional call_task_impl(TaskRegistry const &task_registry, - task_id_t const &task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; - return fn(acc); -} - -std::optional - execute_forward(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &operator_node, - Allocator &allocator) { - if (registry_contains_task_for_layer(local_training_backing.task_registry, - operator_node, - OpTaskType::FWD)) { - - ComputationGraphOpAttrs attrs = - get_layer_attrs(local_training_backing.computation_graph, operator_node) - .op_attrs; - - std::optional device_state = - get_per_device_op_state_if_exists( - local_training_backing.local_args_backing, operator_node); - - TaskInvocation invocation = lower_to_task_invocation( - forward(attrs), - operator_node, - get_incoming_inputs(local_training_backing.computation_graph, - operator_node), - get_incoming_input_shapes(local_training_backing.computation_graph, - operator_node), - get_outgoing_tensors(local_training_backing.computation_graph, - operator_node), - get_incoming_weights(local_training_backing.computation_graph, - operator_node), - local_training_backing.local_tensor_backing.tensor_gradient_mapping, - device_state); - TaskArgumentAccessor accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - invocation, - allocator); - return call_task_impl( - local_training_backing.task_registry, invocation.task_id, accessor); - } else { - return std::nullopt; - } -} - -void compute_loss(LocalTrainingBacking const &local_training_backing, - LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, - Allocator &allocator) { - TaskInvocation loss_invocation = backward( - loss_attrs, - logit_tensor, - local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( - logit_tensor), - label_tensor); - // TODO: https://github.com/flexflow/flexflow-train/issues/1442 - // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); - TaskArgumentAccessor loss_accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - loss_invocation, - allocator); - TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); - loss_impl_fn.get().function_ptr(loss_accessor); -} - -std::optional - execute_backward(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &operator_node, - Allocator &allocator) { - if (registry_contains_task_for_layer(local_training_backing.task_registry, - operator_node, - OpTaskType::BWD)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(local_training_backing.computation_graph, operator_node) - .op_attrs; - - std::optional device_state = - get_per_device_op_state_if_exists( - 
local_training_backing.local_args_backing, operator_node); - TaskInvocation invocation = lower_to_task_invocation( - backward(attrs), - operator_node, - get_incoming_inputs(local_training_backing.computation_graph, - operator_node), - get_incoming_input_shapes(local_training_backing.computation_graph, - operator_node), - get_outgoing_tensors(local_training_backing.computation_graph, - operator_node), - get_incoming_weights(local_training_backing.computation_graph, - operator_node), - local_training_backing.local_tensor_backing.tensor_gradient_mapping, - device_state); - TaskArgumentAccessor accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - invocation, - allocator); - return call_task_impl( - local_training_backing.task_registry, invocation.task_id, accessor); - } else { - return std::nullopt; - } -} - -void execute_update(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &node, - OptimizerAttrs const &optimizer_attrs, - Allocator &allocator) { - LayerAttrs layer_attrs = - get_layer_attrs(local_training_backing.computation_graph, node); - if (layer_attrs.op_attrs.has()) { - // get tensors - tensor_guid_t weight_tensor = get_only( - get_outgoing_tensors(local_training_backing.computation_graph, node)); - gradient_tensor_t weight_grad_tensor = - local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( - weight_tensor); - std::vector optimizer_buffer_tensors = - local_training_backing.local_tensor_backing.tensor_optimizer_mapping.at( - weight_tensor); - - // get invocation - TaskInvocation invocation = get_update_invocation(optimizer_attrs, - weight_tensor, - weight_grad_tensor, - optimizer_buffer_tensors); - - // TODO: https://github.com/flexflow/flexflow-train/issues/1442 - // assert(is_invocation_valid(get_update_signature(attrs), invocation)); - - // execute update - TaskArgumentAccessor accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - invocation, - allocator); - TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); - update_impl_fn.get().function_ptr(accessor); - } -} - -TaskArgumentAccessor - get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, - LocalArgsBacking const &local_args_backing, - TaskInvocation const &invocation, - Allocator &allocator) { - TensorSlotsBacking tensor_slots_backing = - construct_tensor_slots_backing(local_tensor_backing, invocation.binding); - ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( - invocation.binding, local_args_backing.runtime_arg_config); - return TaskArgumentAccessor::create( - allocator, tensor_slots_backing, arg_slots_backing); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc deleted file mode 100644 index f5ce639087..0000000000 --- a/lib/local-execution/src/loss_tensor_source.cc +++ /dev/null @@ -1,13 +0,0 @@ -#include "local-execution/loss_tensor_source.h" - -namespace FlexFlow { - -nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n; - -LossTensorSource::LossTensorSource() {} - -loss_tensor_t LossTensorSource::new_loss_tensor() { - return loss_tensor_t{LossTensorSource::next_available_loss_tensor_id++}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc deleted file mode 100644 index 
d3c1c65a68..0000000000 --- a/lib/local-execution/src/model_training_instance.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "local-execution/model_training_instance.h" -#include "kernels/format_accessor_contents.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "utils/containers/reversed.h" - -namespace FlexFlow { - -ModelTrainingInstance::ModelTrainingInstance( - Allocator const &allocator, - LocalTrainingBacking const &local_training_backing, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, - LossAttrs const &loss_attrs, - OptimizerAttrs const &optimizer_attrs) - : allocator(allocator), training_backing(local_training_backing), - loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs), - logit_tensor(logit_tensor), label_tensor(label_tensor){}; - -PerLayerElapsedTime ModelTrainingInstance::forward() { - PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : - topological_ordering(this->training_backing.computation_graph)) { - std::optional elapsed_time = - execute_forward(this->training_backing, node, this->allocator); - per_layer_elapsed_time.insert({node, elapsed_time}); - } - return per_layer_elapsed_time; -} - -PerLayerElapsedTime ModelTrainingInstance::backward() { - compute_loss(this->training_backing, - this->loss_attrs, - this->logit_tensor, - this->label_tensor, - this->allocator); - - gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); - GenericTensorAccessorW loss_tensor_backing = - this->training_backing.local_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); - - PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : reversed( - topological_ordering(this->training_backing.computation_graph))) { - std::optional elapsed_time = - execute_backward(this->training_backing, node, this->allocator); - per_layer_elapsed_time.insert({node, elapsed_time}); - } - return per_layer_elapsed_time; -} - -void ModelTrainingInstance::update() { - for (layer_guid_t const &node : - topological_ordering(this->training_backing.computation_graph)) { - execute_update( - this->training_backing, node, this->optimizer_attrs, this->allocator); - } - this->optimizer_attrs = - get_optimizer_attrs_for_next_iter(this->optimizer_attrs); -} - -GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { - GenericTensorAccessorW logit_tensor_backing = this->training_backing - .local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); - - - gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); - GenericTensorAccessorW loss_tensor_backing = - this->training_backing.local_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); - - return read_only_accessor_from_write_accessor(loss_tensor_backing); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 4537493c1d..3894fb8d34 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,46 +1,60 @@ #include "task-spec/task_binding.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" +#include "utils/hash/tuple.h" #include "utils/hash/unordered_map.h" namespace FlexFlow { -void TaskBinding::bind(int name, 
tensor_guid_t const &binding) {
+TaskBinding::TaskBinding() : tensor_bindings(), arg_bindings() {}
+
+TaskBinding::TaskBinding(
+    std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t> const
+        &tensor_bindings,
+    std::unordered_map<slot_id_t, TaskArgSpec> const &arg_bindings)
+    : tensor_bindings(tensor_bindings), arg_bindings(arg_bindings) {}
+
+void TaskBinding::bind(int name, forward_tensor_guid_t const &binding) {
   this->bind(slot_id_t{name}, binding);
 }
 
-void TaskBinding::bind(slot_id_t name, tensor_guid_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::FORWARD},
-                                TensorTypeVariant{binding}});
+void TaskBinding::bind(slot_id_t name, forward_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert({tensor_sub_slot_id_t{name, TensorType::FORWARD},
+                                training_tensor_guid_t{binding}});
 }
 
-void TaskBinding::bind_grad(int name, gradient_tensor_t const &binding) {
+void TaskBinding::bind_grad(int name, gradient_tensor_guid_t const &binding) {
   this->bind_grad(slot_id_t{name}, binding);
 }
 
-void TaskBinding::bind_grad(slot_id_t name, gradient_tensor_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT},
-                                TensorTypeVariant{binding}});
+void TaskBinding::bind_grad(slot_id_t name,
+                            gradient_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {tensor_sub_slot_id_t{name, TensorType::GRADIENT},
+       training_tensor_guid_t{binding}});
 }
 
-void TaskBinding::bind_optimizer(int name, optimizer_tensor_t const &binding) {
+void TaskBinding::bind_optimizer(int name,
+                                 optimizer_tensor_guid_t const &binding) {
   this->bind_optimizer(slot_id_t{name}, binding);
 }
 
 void TaskBinding::bind_optimizer(slot_id_t name,
-                                 optimizer_tensor_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER},
-                                TensorTypeVariant{binding}});
+                                 optimizer_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {tensor_sub_slot_id_t{name, TensorType::OPTIMIZER},
+       training_tensor_guid_t{binding}});
 }
 
-void TaskBinding::bind_loss(int name, loss_tensor_t const &binding) {
+void TaskBinding::bind_loss(int name, loss_tensor_guid_t const &binding) {
   this->bind_loss(slot_id_t{name}, binding);
 }
 
-void TaskBinding::bind_loss(slot_id_t name, loss_tensor_t const &binding) {
-  this->tensor_bindings.insert(
-      {SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}});
+void TaskBinding::bind_loss(slot_id_t name, loss_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert({tensor_sub_slot_id_t{name, TensorType::LOSS},
+                                training_tensor_guid_t{binding}});
 }
 
 void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
@@ -56,13 +70,14 @@ bool TaskBinding::operator!=(TaskBinding const &other) const {
   return this->tie() != other.tie();
 }
 
-std::tuple<std::unordered_map<SlotTensorTypeId, TensorTypeVariant> const &,
-           std::unordered_map<slot_id_t, TaskArgSpec> const &>
+std::tuple<
+    std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t> const &,
+    std::unordered_map<slot_id_t, TaskArgSpec> const &>
     TaskBinding::tie() const {
   return std::tie(this->tensor_bindings, this->arg_bindings);
 }
 
-std::unordered_map<SlotTensorTypeId, TensorTypeVariant> const &
+std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t> const &
   TaskBinding::get_tensor_bindings() const {
   return this->tensor_bindings;
 }
@@ -90,10 +105,7 @@ namespace std {
 size_t hash<::FlexFlow::TaskBinding>::operator()(
     ::FlexFlow::TaskBinding const &s) const {
-  size_t result = 0;
-  hash_combine(result, s.get_tensor_bindings());
-  hash_combine(result, s.get_arg_bindings());
-  return result;
+  return ::FlexFlow::get_std_hash(s.tie());
 }
 
 } // namespace std
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
deleted file mode 100644
index
ae3d97daa4..0000000000 --- a/lib/local-execution/src/task_registry.cc +++ /dev/null @@ -1,78 +0,0 @@ -#include "local-execution/task_registry.h" -#include "pcg/computation_graph.h" -#include "task-spec/task_signature_impl.h" - -namespace FlexFlow { - -TaskRegistry construct_task_registry( - std::unordered_map const &layer_attrs_mapping) { - std::unordered_map> init_task_ids; - std::unordered_map> fwd_task_ids; - std::unordered_map> bwd_task_ids; - - std::unordered_map task_mapping; - - for (std::pair const &layer_attrs : - layer_attrs_mapping) { - layer_guid_t node = layer_attrs.first; - init_task_ids.insert({node, std::nullopt}); - fwd_task_ids.insert({node, std::nullopt}); - bwd_task_ids.insert({node, std::nullopt}); - - ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; - std::vector task_ids = get_task_ids(attrs); - - for (task_id_t const &task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); - switch (task_signature_impl.task_signature.type) { - case OpTaskType::INIT: - assert(is_invocation_valid(task_signature_impl.task_signature, - init(attrs))); - init_task_ids[node] = task_id; - break; - case OpTaskType::FWD: - assert(is_invocation_valid(task_signature_impl.task_signature, - forward(attrs))); - fwd_task_ids[node] = task_id; - break; - case OpTaskType::BWD: - assert(is_invocation_valid(task_signature_impl.task_signature, - backward(attrs))); - bwd_task_ids[node] = task_id; - break; - default: - throw mk_runtime_error( - fmt::format("Invalid OpTaskType, got {}", - task_signature_impl.task_signature.type)); - } - task_mapping.insert({task_id, task_signature_impl}); - } - } - - return TaskRegistry{init_task_ids, fwd_task_ids, bwd_task_ids, task_mapping}; -} - -bool registry_contains_task_for_layer(TaskRegistry const &task_registry, - layer_guid_t const &op, - OpTaskType const &op_task_type) { - std::unordered_map> task_ids; - switch (op_task_type) { - case OpTaskType::INIT: - task_ids = task_registry.init_task_ids; - break; - case OpTaskType::FWD: - task_ids = task_registry.forward_task_ids; - break; - case OpTaskType::BWD: - task_ids = task_registry.backward_task_ids; - break; - default: - throw mk_runtime_error( - fmt::format("Invalid OpTaskType, got {}", op_task_type)); - } - - assert(task_ids.count(op)); - return task_ids.at(op).has_value(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index ed181aea32..3ac7352e59 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -19,8 +19,8 @@ void TrackedAllocator::deallocate(void *ptr) { this->current_mem_usage -= psize; } -size_t TrackedAllocator::get_current_mem_usage() { - return this->current_mem_usage; +num_bytes_t TrackedAllocator::get_current_mem_usage() const { + return num_bytes_t{nonnegative_int{this->current_mem_usage}}; } DeviceType TrackedAllocator::get_allocation_device_type() const { diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc deleted file mode 100644 index b8daa90e3b..0000000000 --- a/lib/local-execution/src/unallocated_tensors.cc +++ /dev/null @@ -1,92 +0,0 @@ -#include "local-execution/unallocated_tensors.h" -#include "local-execution/allocated_tensors.h" -#include "pcg/optimizer_attrs.h" - -namespace FlexFlow { - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - 
GradientTensorSource &gradient_tensor_source) { - - assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); - - std::unordered_map tensor_type_shapes; - std::unordered_map gradient_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; - if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { - tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); - } - - if (tensor_attrs.create_grad == CreateGrad::YES && - !allocated_tensors.gradient_mapping.count(tensor_guid)) { - gradient_tensor_t gradient_tensor = - gradient_tensor_source.new_gradient_tensor(); - tensor_type_shapes.insert( - {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); - gradient_mapping.insert({tensor_guid, gradient_tensor}); - } - } - - return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}}; -} - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source, - OptimizerTensorSource &optimizer_tensor_source, - OptimizerAttrs const &optimizer_attrs) { - - UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( - allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); - - if (!get_num_optimizer_tensors(optimizer_attrs)) { - return unallocated_tensors; - } - - std::unordered_map tensor_type_shapes = - unallocated_tensors.tensor_type_shapes; - std::unordered_map gradient_mapping = - unallocated_tensors.gradient_mapping; - std::unordered_map> - optimizer_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_grad == CreateGrad::YES) { - std::vector optimizer_tensors; - - int num_optimizer_tensors_to_allocate = - get_num_optimizer_tensors(optimizer_attrs); - if (allocated_tensors.optimizer_mapping.count(tensor_guid)) { - num_optimizer_tensors_to_allocate -= - allocated_tensors.optimizer_mapping.at(tensor_guid).size(); - } - - for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { - optimizer_tensor_t optimizer_tensor = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensors.push_back(optimizer_tensor); - tensor_type_shapes.insert( - {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); - } - - if (num_optimizer_tensors_to_allocate > 0) { - optimizer_mapping.insert({tensor_guid, optimizer_tensors}); - } - } - } - - return UnallocatedTensors{ - tensor_type_shapes, gradient_mapping, optimizer_mapping}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_utils.cc b/lib/local-execution/test/src/internal/test_utils.cc similarity index 94% rename from lib/local-execution/test/src/test_utils.cc rename to lib/local-execution/test/src/internal/test_utils.cc index b7a4e16b97..629640b6ae 100644 --- a/lib/local-execution/test/src/test_utils.cc +++ b/lib/local-execution/test/src/internal/test_utils.cc @@ -1,4 +1,4 @@ -#include "test_utils.h" +#include "internal/test_utils.h" #include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/test/src/test_utils.h b/lib/local-execution/test/src/internal/test_utils.h similarity index 100% rename from 
lib/local-execution/test/src/test_utils.h rename to lib/local-execution/test/src/internal/test_utils.h diff --git a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc new file mode 100644 index 0000000000..107b835383 --- /dev/null +++ b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc @@ -0,0 +1,142 @@ +#include "local-execution/local_cost_estimator.h" +#include "doctest/doctest.h" +#include "internal/test_utils.h" +#include "kernels/device_handle_t.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/ops/attention.h" +#include "op-attrs/ops/cast.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/machine_view.h" +#include "task-spec/runtime_arg_config.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalCostEstimator") { + RuntimeArgConfig runtime_arg_config = + cpu_make_runtime_arg_config(EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, + /*measure_iters=*/1}); + + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.1, + /*momentum=*/0.1, + /*nesterov=*/false, + /*weight_decay=*/0.1, + }, + }; + + CostEstimator cost_estimator = get_local_cost_estimator(runtime_arg_config); + + SUBCASE("estimate operator cost") { + CastAttrs attrs = CastAttrs{ + /*dtype=*/DataType::INT32, + }; + + ParallelTensorShape input_shape = lift_to_parallel(TensorShape{ + TensorDims{FFOrdered{40_p, 48_p, 36_p}}, + DataType::FLOAT, + }); + + ParallelTensorShape output_shape = + throw_if_unexpected(get_output_shape(attrs, input_shape)); + + OpCostEstimateKey op_cost_estimate_key = OpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{attrs}, + /*input_shapes=*/{input_shape}, + /*weight_shapes=*/{}, + /*output_shapes=*/{output_shape}, + /*optimizer_attrs=*/optimizer_attrs, + /*machine_view=*/ + make_1d_machine_view( + MachineSpaceCoordinate{0_n, 0_n, DeviceType::CPU}, + MachineSpecificationDimension::INTRA_NODE, + stride_t{1_p}), + }; + + OpCostMetrics result = cost_estimator.estimate_cost(op_cost_estimate_key); + + CHECK(result.forward_runtime > 0_ms); + CHECK(result.backward_runtime > 0_ms); + CHECK(result.memory_usage > 0_bytes); + } + } +} + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("LocalCostEstimator (CUDA)") { + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + RuntimeArgConfig runtime_arg_config = + gpu_make_runtime_arg_config(managed_handle.raw_handle(), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, + /*measure_iters=*/1}); + + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.1, + /*momentum=*/0.1, + /*nesterov=*/false, + /*weight_decay=*/0.1, + }, + }; + + CostEstimator cost_estimator = get_local_cost_estimator(runtime_arg_config); + + SUBCASE("estimate operator cost") { + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; + MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/false, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }; + + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; + + DataType dtype = DataType::FLOAT; + ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ + TensorDims{ + 
FFOrdered{batch_size, seq_len, feature_size}}, + DataType::FLOAT, + }); + + ParallelTensorShape weights_shape = throw_if_unexpected( + get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + + ParallelTensorShape output_shape = throw_if_unexpected( + get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + + OpCostEstimateKey op_cost_estimate_key = OpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{attrs}, + /*input_shapes=*/{inputs_shape, inputs_shape, inputs_shape}, + /*weight_shapes=*/{weights_shape}, + /*output_shapes=*/{output_shape}, + /*optimizer_attrs=*/optimizer_attrs, + /*machine_view=*/ + make_1d_machine_view( + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, + MachineSpecificationDimension::INTRA_NODE, + stride_t{1_p}), + }; + + OpCostMetrics result = cost_estimator.estimate_cost(op_cost_estimate_key); + + CHECK(result.forward_runtime > 0_ms); + CHECK(result.backward_runtime > 0_ms); + CHECK(result.memory_usage > 0_bytes); + } + } +} diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc similarity index 86% rename from lib/local-execution/test/src/test_local_task_arg_accessor.cc rename to lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc index 5c11010e2a..482795b278 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc @@ -1,6 +1,6 @@ +#include "local-execution/local_task_argument_accessor.h" #include "doctest/doctest.h" #include "kernels/local_cpu_allocator.h" -#include "local-execution/local_task_argument_accessor.h" #include "task-spec/task_signature_impl.h" #include "utils/fmt/variant.h" @@ -36,16 +36,33 @@ TEST_SUITE(FF_TEST_SUITE) { VARIADIC_TENSORS, }; - TensorSlotsBacking tensor_slots_backing = { - {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::FORWARD}, input}, - {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::GRADIENT}, input_grad}, - {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, - variadic_tensors}, - {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::GRADIENT}, - variadic_tensors_grad}, - }; + std::unordered_map + tensor_slots_backing = { + { + tensor_sub_slot_id_t{slot_id_t{INPUT}, TensorType::FORWARD}, + TensorSlotBacking{input}, + }, + { + tensor_sub_slot_id_t{slot_id_t{INPUT}, TensorType::GRADIENT}, + TensorSlotBacking{input_grad}, + }, + { + tensor_sub_slot_id_t{slot_id_t{VARIADIC_TENSORS}, + TensorType::FORWARD}, + TensorSlotBacking{variadic_tensors}, + }, + { + tensor_sub_slot_id_t{slot_id_t{VARIADIC_TENSORS}, + TensorType::GRADIENT}, + TensorSlotBacking{variadic_tensors_grad}, + }, + }; - LocalTaskArgumentAccessor acc = {allocator, tensor_slots_backing, {}}; + LocalTaskArgumentAccessor acc = LocalTaskArgumentAccessor{ + /*allocator=*/allocator, + /*tensor_slots_backing=*/tensor_slots_backing, + /*arg_slots_backing=*/{}, + }; SUBCASE("get_tensor") { SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { @@ -55,6 +72,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input_grad)}; @@ -62,24 +80,28 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT); CHECK(correct == result); } + 
SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = acc.get_tensor( slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = acc.get_tensor( slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = acc.get_tensor( slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = acc.get_tensor( @@ -100,6 +122,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::FORWARD); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, " "TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = @@ -112,6 +135,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::GRADIENT); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = @@ -120,6 +144,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::FORWARD); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = @@ -128,6 +153,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::GRADIENT); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = @@ -136,6 +162,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::FORWARD); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = diff --git a/lib/local-execution/test/src/local-execution/local_task_registry.cc b/lib/local-execution/test/src/local-execution/local_task_registry.cc new file mode 100644 index 0000000000..27cd74b2a6 --- /dev/null +++ b/lib/local-execution/test/src/local-execution/local_task_registry.cc @@ -0,0 +1,278 @@ +#include "local-execution/local_task_registry.h" +#include "kernels/local_cuda_allocator.h" +#include "local-execution/local_cost_estimator.h" +#include "local-execution/local_task_registry.dtg.h" +#include "local-execution/operator_task_set.h" +#include "local-execution/registered_task.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/layer_guid_t.dtg.h" +#include "task-spec/task_signature_impl.h" +#include "utils/fmt/optional.h" +#include "utils/fmt/unordered_map.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalTaskRegistry") { + layer_guid_t layer_guid = layer_guid_t{Node{0}}; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; + ComputationGraphOpAttrs attrs = + ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + 
/*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }}; + + OperatorTaskSet mha_task_set = get_task_set_for_operator(attrs); + { + OperatorTaskSet expected_mha_task_set = OperatorTaskSet{ + /*init_task=*/registered_task_t{task_id_t::ATTENTION_INIT_TASK_ID}, + /*fwd_task=*/registered_task_t{task_id_t::ATTENTION_FWD_TASK_ID}, + /*bwd_task=*/registered_task_t{task_id_t::ATTENTION_BWD_TASK_ID}, + }; + REQUIRE(mha_task_set == expected_mha_task_set); + } + + std::unordered_map mha_task_mapping = { + {task_id_t::ATTENTION_INIT_TASK_ID, + get_task_signature_and_impl_for_task_id( + task_id_t::ATTENTION_INIT_TASK_ID)}, + {task_id_t::ATTENTION_FWD_TASK_ID, + get_task_signature_and_impl_for_task_id( + task_id_t::ATTENTION_FWD_TASK_ID)}, + {task_id_t::ATTENTION_BWD_TASK_ID, + get_task_signature_and_impl_for_task_id( + task_id_t::ATTENTION_BWD_TASK_ID)}, + }; + + SUBCASE("register single layer") { + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); + + LocalTaskRegistry correct_task_registry = [&] { + std::unordered_map task_sets = { + { + layer_guid, + mha_task_set, + }, + }; + + return LocalTaskRegistry{ + /*task_sets=*/{ + {layer_guid, mha_task_set}, + }, + /*task_mapping=*/mha_task_mapping, + }; + }(); + + CHECK(task_registry == correct_task_registry); + } + + SUBCASE("multiple layers same task") { + layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{attrs, std::nullopt}}, + {other_layer_guid, LayerAttrs{attrs, std::nullopt}}, + }); + + SUBCASE("layer to task ids") { + std::unordered_map correct = { + {layer_guid, mha_task_set}, + {other_layer_guid, mha_task_set}, + }; + CHECK(task_registry.task_sets == correct); + } + + SUBCASE("task to signature+impl mapping") { + std::unordered_map correct = + mha_task_mapping; + + CHECK(task_registry.task_mapping == correct); + } + } + + SUBCASE("different attrs, still same task fn mapping") { + layer_guid_t layer_1 = layer_guid_t{Node{1}}; + positive_int embed_dim = 100_p; + layer_guid_t layer_2 = layer_guid_t{Node{2}}; + ComputationGraphOpAttrs other_attrs = + ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }}; + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{attrs, std::nullopt}}, + {layer_1, LayerAttrs{attrs, std::nullopt}}, + {layer_2, LayerAttrs{other_attrs, std::nullopt}}, + }); + + std::unordered_map correct_task_mapping = + mha_task_mapping; + + CHECK(task_registry.task_mapping == correct_task_mapping); + } + + SUBCASE("equality") { + SUBCASE("different attrs is still equal") { + positive_int embed_dim = 100_p; + ComputationGraphOpAttrs other_attrs = + ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }}; + + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); + LocalTaskRegistry other_task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{other_attrs, std::nullopt}}}); + + 
CHECK(task_registry == other_task_registry); + } + + SUBCASE("different layer_guid is not equal") { + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); + layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; + LocalTaskRegistry other_task_registry = + construct_local_task_registry_for_layers( + {{other_layer_guid, LayerAttrs{attrs, std::nullopt}}}); + + CHECK(task_registry != other_task_registry); + } + } + + SUBCASE("try_get_registered_task") { + SUBCASE("Task exists") { + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{attrs, std::nullopt}}, + }); + + SUBCASE("Init") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::INIT); + std::optional correct = registered_task_t{ + task_id_t::ATTENTION_INIT_TASK_ID, + }; + + CHECK(result == correct); + } + + SUBCASE("Fwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::FWD); + std::optional correct = registered_task_t{ + task_id_t::ATTENTION_FWD_TASK_ID, + }; + + CHECK(result == correct); + } + + SUBCASE("Bwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::BWD); + std::optional correct = registered_task_t{ + task_id_t::ATTENTION_BWD_TASK_ID, + }; + + CHECK(result == correct); + } + } + + SUBCASE("Partial task does not exist") { + ComputationGraphOpAttrs bmm_attrs = ComputationGraphOpAttrs{ + BatchMatmulAttrs{/*a_seq_length_dim=*/10_n, + /*b_seq_length_dim=*/20_n}}; + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{bmm_attrs, std::nullopt}}, + }); + + SUBCASE("Init") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::INIT); + std::optional correct = + make_noop_registered_task(); + + CHECK(result == correct); + } + + SUBCASE("Fwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::FWD); + std::optional correct = registered_task_t{ + task_id_t::BATCHMATMUL_FWD_TASK_ID, + }; + + CHECK(result == correct); + } + + SUBCASE("Bwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::BWD); + std::optional correct = registered_task_t{ + task_id_t::BATCHMATMUL_BWD_TASK_ID, + }; + + CHECK(result == correct); + } + } + + SUBCASE("Empty tasks") { + LocalTaskRegistry task_registry = LocalTaskRegistry{ + /*task_sets=*/{}, + /*task_mapping=*/{}, + }; + + SUBCASE("Init") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::INIT); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + + SUBCASE("Fwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::FWD); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + + SUBCASE("Bwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::BWD); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + } + } + } +} diff --git a/lib/local-execution/test/src/local-execution/local_tensor_backing.cc b/lib/local-execution/test/src/local-execution/local_tensor_backing.cc new file mode 100644 index 0000000000..2f5bf493d6 --- /dev/null +++ b/lib/local-execution/test/src/local-execution/local_tensor_backing.cc @@ -0,0 +1,285 @@ +#include "local-execution/local_tensor_backing.h" +#include "internal/test_utils.h" 
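//
// Illustrative note (not part of the patch): the contract exercised by this
// test file is that construct_local_tensor_backing allocates a buffer for
// every entry of training_tensor_shapes except those already present in
// preallocated_tensors, which are adopted as-is, and that a preallocated
// tensor missing from training_tensor_shapes is an error. A minimal sketch of
// that behavior, assuming the helper names used elsewhere in this patch:
//
//   std::unordered_map<training_tensor_guid_t, GenericTensorAccessorW> backing;
//   for (auto const &[t, shape] : training_tensor_shapes) {
//     if (contains_key(preallocated_tensors, t)) {
//       backing.insert({t, preallocated_tensors.at(t)}); // adopt, don't copy
//     } else {
//       backing.insert({t, allocator.allocate_tensor(shape)}); // fresh buffer
//     }
//   }
//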
+#include "kernels/local_cpu_allocator.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "test/utils/doctest/check_kv.h" +#include "test/utils/doctest/fmt/unordered_map.h" +#include "utils/containers/keys.h" +#include + +using namespace ::FlexFlow; + +bool is_shape_and_dtype_equal_for_tensor_backings( + LocalTensorBacking const &b1, LocalTensorBacking const &b2) { + + std::unordered_map m1 = + b1.backing_for_training_tensor_map; + std::unordered_map m2 = + b2.backing_for_training_tensor_map; + + if (keys(m1) == keys(m2)) { + for (std::pair const + &tensor_type_backing : m1) { + if (tensor_type_backing.second.shape == + m2.at(tensor_type_backing.first).shape) { + continue; + } else { + return false; + } + } + return true; + } else { + return false; + } +} + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("construct_local_tensor_backing") { + Allocator allocator = create_local_cpu_memory_allocator(); + + training_tensor_guid_t t1 = + training_tensor_guid_t{forward_tensor_guid_t{4}}; + training_tensor_guid_t t2 = + training_tensor_guid_t{gradient_tensor_guid_t{4}}; + training_tensor_guid_t t3 = + training_tensor_guid_t{gradient_tensor_guid_t{5}}; + training_tensor_guid_t t4 = + training_tensor_guid_t{gradient_tensor_guid_t{6}}; + + TensorShape tensor_shape_1 = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + TensorShape tensor_shape_2 = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + std::unordered_map + training_tensor_shapes = { + {t1, tensor_shape_1}, + {t2, tensor_shape_2}, + {t3, tensor_shape_1}, + }; + + GenericTensorAccessorW t3_accessor = + allocator.allocate_tensor(tensor_shape_2); + SUBCASE("allocates all non-preallocated tensors and does not re-allocate " + "the preallocated ones") { + std::unordered_map + preallocated_tensors = { + {t3, t3_accessor}, + }; + + LocalTensorBacking result = construct_local_tensor_backing( + /*training_tensor_shapes=*/training_tensor_shapes, + /*preallocated_tensors=*/preallocated_tensors, + /*allocator=*/allocator); + LocalTensorBacking correct = LocalTensorBacking{ + /*backing_for_training_tensor_map=*/{ + {t3, t3_accessor}, + {t1, allocator.allocate_tensor(tensor_shape_1)}, + {t2, allocator.allocate_tensor(tensor_shape_2)}, + }, + }; + + CHECK_MESSAGE( + is_shape_and_dtype_equal_for_tensor_backings(result, correct), + check_kv("result", fmt::to_string(result)), + check_kv("correct", fmt::to_string(correct))); + + CHECK(get_accessor_for_training_tensor(result, t3) == t3_accessor); + } + + SUBCASE("fails if a preallocated tensor is not in training_tensor_shapes") { + std::unordered_map + preallocated_tensors = { + {t4, t3_accessor}, + }; + + CHECK_THROWS(construct_local_tensor_backing( + /*training_tensor_shapes=*/training_tensor_shapes, + /*preallocated_tensors=*/preallocated_tensors, + /*allocator=*/allocator)); + } + } + + TEST_CASE("get_accessor_for_training_tensor") { + Allocator allocator = create_local_cpu_memory_allocator(); + + TensorShape tensor_shape = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + training_tensor_guid_t t1 = + training_tensor_guid_t{forward_tensor_guid_t{4}}; + training_tensor_guid_t t2 = + training_tensor_guid_t{gradient_tensor_guid_t{4}}; + + GenericTensorAccessorW t1_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t2_accessor = + allocator.allocate_tensor(tensor_shape); + + 
LocalTensorBacking local_tensor_backing = LocalTensorBacking{ + /*backing_for_training_tensor_map=*/{{ + t1, + t1_accessor, + }, + { + t2, + t2_accessor, + }}, + }; + + SUBCASE("returns corresponding accessor if training tensor is present") { + GenericTensorAccessorW result = + get_accessor_for_training_tensor(local_tensor_backing, t1); + GenericTensorAccessorW correct = t1_accessor; + + CHECK(result == correct); + } + + SUBCASE("fails if the training tensor is not present") { + training_tensor_guid_t t3 = + training_tensor_guid_t{optimizer_tensor_guid_t{4}}; + training_tensor_guid_t t4 = + training_tensor_guid_t{forward_tensor_guid_t{3}}; + + CHECK_THROWS(get_accessor_for_training_tensor(local_tensor_backing, t3)); + CHECK_THROWS(get_accessor_for_training_tensor(local_tensor_backing, t4)); + } + } + + TEST_CASE("construct_tensor_slots_backing_for_binding") { + enum Slots { + TENSOR_SLOT_1, + TENSOR_SLOT_2, + TENSOR_SLOT_3, + ARG_SLOT, + }; + + Allocator allocator = create_local_cpu_memory_allocator(); + + TensorShape tensor_shape = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + training_tensor_guid_t t1 = + training_tensor_guid_t{forward_tensor_guid_t{4}}; + training_tensor_guid_t t2 = + training_tensor_guid_t{forward_tensor_guid_t{5}}; + training_tensor_guid_t t3 = + training_tensor_guid_t{forward_tensor_guid_t{6}}; + training_tensor_guid_t t4 = + training_tensor_guid_t{gradient_tensor_guid_t{5}}; + + GenericTensorAccessorW t1_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t2_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t3_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t4_accessor = + allocator.allocate_tensor(tensor_shape); + + tensor_sub_slot_id_t tensor_slot_1_forward = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_1}, + TensorType::FORWARD, + }; + tensor_sub_slot_id_t tensor_slot_1_gradient = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_1}, + TensorType::GRADIENT, + }; + tensor_sub_slot_id_t tensor_slot_2_forward = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_2}, + TensorType::FORWARD, + }; + tensor_sub_slot_id_t tensor_slot_3_forward = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_3}, + TensorType::FORWARD, + }; + + LocalTensorBacking local_tensor_backing = LocalTensorBacking{ + /*backing_for_training_tensor_map=*/{{ + t1, + t1_accessor, + }, + { + t2, + t2_accessor, + }, + { + t3, + t3_accessor, + }, + { + t4, + t4_accessor, + }}, + }; + + TaskBinding task_binding = TaskBinding{ + /*tensor_bindings=*/{ + { + tensor_slot_1_forward, + t1, + }, + { + tensor_slot_2_forward, + t2, + }, + { + tensor_slot_1_gradient, + t4, + }, + }, + /*arg_bindings=*/ + { + { + slot_id_t{ARG_SLOT}, + TaskArgSpec{ + ConcreteArgSpec::create(4), + }, + }, + }, + }; + + std::unordered_map result = + construct_tensor_slots_backing_for_binding(local_tensor_backing, + task_binding); + std::unordered_map correct = { + { + tensor_slot_1_forward, + TensorSlotBacking{t1_accessor}, + }, + { + tensor_slot_2_forward, + TensorSlotBacking{t2_accessor}, + }, + { + tensor_slot_1_gradient, + TensorSlotBacking{t4_accessor}, + }, + }; + + CHECK(result == correct); + } +} diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/local-execution/local_training_backing.cc similarity index 68% rename from lib/local-execution/test/src/test_update.cc rename to lib/local-execution/test/src/local-execution/local_training_backing.cc index 54c64e6b6c..5436dbdbb7 100644 
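//
// Illustrative note (not part of the patch): the renamed test below drives
// execute_update. Per the hunks above, its flow is: skip layers whose attrs
// are not weights; otherwise look up the weight's forward, gradient, and
// optimizer tensors, build the optimizer's TaskInvocation, and run the
// resulting task implementation. A hedged sketch of the per-weight step,
// reusing names introduced in this patch:
//
//   TaskInvocation invocation = get_update_invocation(
//       optimizer_attrs,
//       weight_tensor_group.forward_tensor,
//       weight_tensor_group.gradient_tensor,
//       weight_tensor_group.optimizer_tensors);
//   TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
//   update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
//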
--- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/local-execution/local_training_backing.cc @@ -1,18 +1,23 @@ -#include "doctest/doctest.h" +#include "local-execution/local_training_backing.h" +#include "internal/test_utils.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/local_training_backing.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/get_only.h" +#include using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("ExecuteUpdate") { + TEST_CASE("execute_update") { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( @@ -20,7 +25,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); - AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); @@ -56,14 +60,35 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { "linear"}, inputs_layer.outputs, weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), + RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( + managed_handle.raw_handle(), EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); + ForwardTensorSource forward_tensor_source; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + auto make_training_backing = [&](OptimizerAttrs const &optimizer_attrs) { + TrainingComputationGraph training_computation_graph = + generate_training_computation_graph(computation_graph, + optimizer_attrs, + logit_tensor, + forward_tensor_source, + gradient_tensor_source, + optimizer_tensor_source, + loss_tensor_source); + + return make_local_training_backing_for_computation_graph( + /*allocator=*/allocator, + /*preallocated_tensors=*/{}, + /*training_computation_graph=*/training_computation_graph, + /*runtime_arg_config=*/runtime_arg_config, + /*optimizer_attrs=*/optimizer_attrs); + }; SUBCASE("SGDOptimizerAttrs") { SUBCASE("momentum=0") { @@ -72,39 +97,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*momentum=*/0.0f, /*nesterov=*/false, /*weight_decay=*/0.001}}; - LocalTrainingBacking local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - execute_update(local_training_backing, + + execute_update(make_training_backing(optimizer_attrs), linear_operator.layer, optimizer_attrs, allocator); } + SUBCASE("momentum=0.9") { OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - LocalTrainingBacking 
local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - execute_update(local_training_backing, + + execute_update(make_training_backing(optimizer_attrs), linear_operator.layer, optimizer_attrs, allocator); } } + SUBCASE("AdamOptimizerAttrs") { OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, @@ -115,15 +128,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*beta_t=*/0.9, /*beta2_t=*/0.999, /*epsilon=*/1e-8}}; - LocalTrainingBacking local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - execute_update(local_training_backing, + execute_update(make_training_backing(optimizer_attrs), linear_operator.layer, optimizer_attrs, allocator); diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/local-execution/loss_functions.cc similarity index 54% rename from lib/local-execution/test/src/test_loss_functions.cc rename to lib/local-execution/test/src/local-execution/loss_functions.cc index d741d4d8d4..e5fffb980c 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/local-execution/loss_functions.cc @@ -1,14 +1,19 @@ #include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/training_computation_graph.h" #include "utils/containers/get_only.h" using namespace ::FlexFlow; @@ -23,34 +28,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - // allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_for_nonconfigurable_loss_attrs = - loss_tensor_source.new_loss_tensor(); - loss_tensor_t label_for_sparse_cce_loss_attrs = - loss_tensor_source.new_loss_tensor(); - positive_int batch_size = 10_p; positive_int data_dim = 16_p; positive_int output_dim = 32_p; - TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; - - GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = - allocator.allocate_tensor(output_tensor_shape); - GenericTensorAccessorW label_for_sparse_cce_loss_attrs_backing = - allocator.allocate_tensor(reduced_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_for_nonconfigurable_loss_attrs}, - label_for_nonconfigurable_loss_attrs_backing}, - {TensorTypeVariant{label_for_sparse_cce_loss_attrs}, - label_for_sparse_cce_loss_attrs_backing}}, - {}, - {}}; - // construct computation graph ComputationGraph computation_graph = 
make_empty_computation_graph(); @@ -83,60 +64,92 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { weights_layer.outputs); tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), + RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( + managed_handle.raw_handle(), EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // initialize training backing + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); + + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.0, + /*momentum=*/0.0, + /*nesterov=*/false, + /*weight_decay=*/0.0, + }, + }; + + ForwardTensorSource forward_tensor_source; GradientTensorSource gradient_tensor_source; - LocalTrainingBacking local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - computation_graph, - runtime_arg_config}; + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + TrainingComputationGraph training_computation_graph = + generate_training_computation_graph(computation_graph, + optimizer_attrs, + logit_tensor, + forward_tensor_source, + gradient_tensor_source, + optimizer_tensor_source, + loss_tensor_source); + + auto make_training_backing = [&](TensorShape const &label_tensor_shape) { + GenericTensorAccessorW label_tensor_accessor = + allocator.allocate_tensor(label_tensor_shape); + + return make_local_training_backing_for_computation_graph( + /*allocator=*/allocator, + /*preallocated_tensors=*/ + { + { + training_tensor_guid_t{ + training_computation_graph.label_tensor}, + label_tensor_accessor, + }, + }, + /*training_computation_graph=*/training_computation_graph, + /*runtime_arg_config=*/runtime_arg_config, + /*optimizer_attrs=*/optimizer_attrs); + }; SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { + TensorShape label_tensor_shape = + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; + + LocalTrainingBacking local_training_backing = + make_training_backing(label_tensor_shape); + LossAttrs loss_attrs = LossAttrs{ SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; - compute_loss(local_training_backing, - loss_attrs, - logit_tensor, - label_for_sparse_cce_loss_attrs, - allocator); + compute_loss(local_training_backing, loss_attrs, allocator); } SUBCASE("NonconfigurableLossAttrs") { + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + LocalTrainingBacking local_training_backing = + make_training_backing(label_tensor_shape); + SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - compute_loss(local_training_backing, - loss_attrs, - logit_tensor, - label_for_nonconfigurable_loss_attrs, - allocator); + + compute_loss(local_training_backing, loss_attrs, allocator); } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; - compute_loss(local_training_backing, - loss_attrs, - logit_tensor, - label_for_nonconfigurable_loss_attrs, - allocator); + + compute_loss(local_training_backing, loss_attrs, allocator); } SUBCASE("LossFunction::IDENTITY") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - compute_loss(local_training_backing, - loss_attrs, - 
logit_tensor, - label_for_nonconfigurable_loss_attrs, - allocator); + + compute_loss(local_training_backing, loss_attrs, allocator); } } } diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc deleted file mode 100644 index 3242ca79ad..0000000000 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ /dev/null @@ -1,226 +0,0 @@ -#include "kernels/local_cpu_allocator.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "pcg/computation_graph.dtg.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("AllocatedTensors") { - MockTensorGuidSource tensor_guid_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - Allocator allocator = create_local_cpu_memory_allocator(); - - tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = - tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); - - TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, - CreateGrad::YES}; - - GenericTensorAccessorW tensor_backing_1 = - allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = - allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = - allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); - - std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, - }; - - SUBCASE("Trivial tensors") { - SUBCASE("Empty") { - AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Loss tensor") { - loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{loss_tensor}, tensor_backing_1}}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - } - - SUBCASE("Forward tensors") { - SUBCASE("Correct forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{mock_tensor_1}, tensor_backing_1}}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Incorrect forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{mock_tensor_1}, tensor_backing_2}}, {}, {}}; - 
bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, - }, - {}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - } - - SUBCASE("Gradient tensors") { - gradient_tensor_t grad_tensor_3 = - gradient_tensor_source.new_gradient_tensor(); - - SUBCASE("Gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, - {{mock_tensor_3_with_grad, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Dangling gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Dangling gradient tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {}, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Gradient allocated for forward tensor without gradient") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, - {{mock_tensor_2, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Gradient tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_2}}, - {{mock_tensor_3_with_grad, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Gradient tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, - {{dangling_tensor, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - } - - SUBCASE("Optimizer tensors") { - optimizer_tensor_t optimizer_tensor_3 = - optimizer_tensor_source.new_optimizer_tensor(); - - SUBCASE("Optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Dangling optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Dangling optimizer tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {}, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Optimizer allocated for forward tensor without gradient") 
{ - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {{mock_tensor_2, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Optimizer tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Optimizer tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {{dangling_tensor, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - } - } -} diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index f1c83e76a0..f8d34fc5ff 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,15 +1,25 @@ +#include "internal/test_utils.h" #include "kernels/compare_tensor_accessors.h" +#include "kernels/copy_tensor_accessor.h" #include "kernels/format_accessor_contents.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" #include "kernels/tensor_accessor_reductions.h" -#include "kernels/test_utils.h" -#include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/training_computation_graph.h" +#include "test/utils/doctest/check_kv.h" #include "utils/containers/get_only.h" #include @@ -23,8 +33,139 @@ bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } -TEST_SUITE(FF_CUDA_TEST_SUITE) { +TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalBackend e2e Training") { + Allocator allocator = create_local_cpu_memory_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT}; + + LayerAddedResult inputs_layer = + 
add_input_layer_with_grad(computation_graph, input_tensor_shape);
+
+    LayerAddedResult weights_layer_1 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                       weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                   std::nullopt},
+        {},
+        {});
+
+    LayerAddedResult weights_layer_2 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                       weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                   std::nullopt},
+        {},
+        {});
+
+    LayerAddedResult linear_operator_1 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim,
+                                                       /*use_bias=*/false,
+                                                       DataType::FLOAT,
+                                                       Activation::RELU,
+                                                       std::nullopt}},
+                   std::nullopt},
+        inputs_layer.outputs,
+        weights_layer_1.outputs);
+
+    LayerAddedResult linear_operator_2 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim,
+                                                       /*use_bias=*/false,
+                                                       DataType::FLOAT,
+                                                       Activation::RELU,
+                                                       std::nullopt}},
+                   std::nullopt},
+        linear_operator_1.outputs,
+        weights_layer_2.outputs);
+
+    tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs);
+
+    RuntimeArgConfig runtime_arg_config = cpu_make_runtime_arg_config(
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1});
+
+    // initialize training backing
+    LossAttrs loss_attrs = LossAttrs{
+        NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+    OptimizerAttrs optimizer_attrs =
+        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                         /*momentum=*/0.9,
+                                         /*nesterov=*/false,
+                                         /*weight_decay=*/0.001}};
+
+    ForwardTensorSource forward_tensor_source;
+    GradientTensorSource gradient_tensor_source;
+    OptimizerTensorSource optimizer_tensor_source;
+    LossTensorSource loss_tensor_source;
+
+    TrainingComputationGraph training_computation_graph =
+        generate_training_computation_graph(computation_graph,
+                                            optimizer_attrs,
+                                            logit_tensor,
+                                            forward_tensor_source,
+                                            gradient_tensor_source,
+                                            optimizer_tensor_source,
+                                            loss_tensor_source);
+
+    LocalTrainingBacking local_training_backing =
+        make_local_training_backing_for_computation_graph(
+            /*allocator=*/allocator,
+            /*preallocated_tensors=*/{},
+            /*training_computation_graph=*/training_computation_graph,
+            /*runtime_arg_config=*/runtime_arg_config,
+            /*optimizer_attrs=*/optimizer_attrs);
+
+    // begin training loop
+    ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+        allocator, local_training_backing, loss_attrs, optimizer_attrs};
+
+    int num_epochs = 5;
+    std::vector<GenericTensorAccessorR> loss_values;
+
+    for (int i = 0; i < num_epochs; i++) {
+      model_training_instance.forward();
+      model_training_instance.backward();
+      model_training_instance.update();
+      loss_values.push_back(copy_tensor_accessor_r(
+          model_training_instance.get_loss_tensor_accessor(), allocator));
+    }
+
+    // Assert that each sample in the batch has a lower loss in the last
+    // epoch than in the first epoch
+    GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+    GenericTensorAccessorR last_epoch_loss = loss_values.back();
+    CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
+                  check_kv("first_epoch_loss",
+                           format_accessor_r_contents(first_epoch_loss)),
+                  check_kv("last_epoch_loss",
+                           format_accessor_r_contents(last_epoch_loss)));
+  }
+}
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("LocalBackend e2e Training (CUDA)") {
     // initialize runtime
     ManagedFFStream managed_stream{};
     ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
@@ -33,42 +174,30 @@
     Allocator 
allocator = create_local_cuda_memory_allocator(); - // allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - positive_int batch_size = 10_p; positive_int data_dim = 16_p; positive_int hidden_dim = 32_p; positive_int output_dim = 1_p; - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( - output_tensor_shape, allocator); + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( - weight_shape_1, allocator); - GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( - weight_shape_2, allocator); - LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); - tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); - GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( - input_tensor_shape, allocator); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -77,7 +206,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); - tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -86,7 +214,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); - tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, @@ -112,51 +239,55 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), + RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( + managed_handle.raw_handle(), EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); // initialize training backing LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001, + }, + }; + ForwardTensorSource forward_tensor_source; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; - AllocatedTensors allocated_tensors = AllocatedTensors{ - /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, - {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, - 
{TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing},
-        },
-        /*gradient_mapping=*/{},
-        /*optimizer_mapping*/ {},
-    };
+    TrainingComputationGraph training_computation_graph =
+        generate_training_computation_graph(computation_graph,
+                                            optimizer_attrs,
+                                            logit_tensor,
+                                            forward_tensor_source,
+                                            gradient_tensor_source,
+                                            optimizer_tensor_source,
+                                            loss_tensor_source);

     LocalTrainingBacking local_training_backing =
-        LocalTrainingBacking{allocator,
-                             allocated_tensors,
-                             gradient_tensor_source,
-                             optimizer_tensor_source,
-                             computation_graph,
-                             runtime_arg_config,
-                             optimizer_attrs};
+        make_local_training_backing_for_computation_graph(
+            /*allocator=*/allocator,
+            /*preallocated_tensors=*/
+            {
+                {
+                    training_tensor_guid_t{
+                        training_computation_graph.label_tensor},
+                    label_tensor_backing,
+                },
+            },
+            /*training_computation_graph=*/training_computation_graph,
+            /*runtime_arg_config=*/runtime_arg_config,
+            /*optimizer_attrs=*/optimizer_attrs);

     // begin training loop
-    ModelTrainingInstance model_training_instance =
-        ModelTrainingInstance{allocator,
-                              local_training_backing,
-                              logit_tensor,
-                              label_tensor,
-                              loss_attrs,
-                              optimizer_attrs};
+    ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+        allocator, local_training_backing, loss_attrs, optimizer_attrs};

     Allocator cpu_allocator = create_local_cpu_memory_allocator();

@@ -174,9 +305,9 @@
     // Assert that each sample in the batch has a lower loss in last epoch than
     // the first epoch
     GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
-    GenericTensorAccessorR last_epoch = loss_values.back();
-    CHECK(did_loss_decrease(first_epoch_loss, last_epoch));
+    GenericTensorAccessorR last_epoch_loss = loss_values.back();
+    CHECK(did_loss_decrease(first_epoch_loss, last_epoch_loss));
   }
 }
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc
deleted file mode 100644
index 42b88aa6bc..0000000000
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-#include "doctest/doctest.h"
-#include "kernels/local_cuda_allocator.h"
-#include "kernels/managed_per_device_ff_handle.h"
-#include "local-execution/local_cost_estimator.h"
-#include "op-attrs/ops/attention.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "pcg/computation_graph_builder.h"
-#include "test_utils.h"
-
-using namespace ::FlexFlow;
-
-TEST_SUITE(FF_CUDA_TEST_SUITE) {
-  TEST_CASE("LocalCostEstimator") {
-    ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
-        /*workSpaceSize=*/1024 * 1024,
-        /*allowTensorOpMathConversion=*/true);
-
-    RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
-        DeviceSpecific::create(managed_handle.raw_handle()),
-        EnableProfiling::YES,
-        ProfilingSettings{/*warmup_iters=*/0,
-                          /*measure_iters=*/1}};
-
-    LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config};
-
-    SUBCASE("Estimate cost -- Attention Op") {
-      positive_int embed_dim = 32_p;
-      positive_int num_heads = 10_p;
-      MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{
-          /*embed_dim=*/embed_dim,
-          /*num_heads=*/num_heads,
-          /*kdim=*/embed_dim,
-          /*vdim=*/embed_dim,
-          /*dropout=*/0.0,
-          /*bias=*/false,
-          /*add_bias_kv=*/false,
-          /*add_zero_attn=*/false,
-      };
-
-      positive_int batch_size = 40_p;
-      positive_int seq_len = 48_p;
-      positive_int feature_size = 36_p;
-
-      DataType dtype = DataType::FLOAT;
-      ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{
-          TensorDims{
-              FFOrdered{batch_size, seq_len, feature_size}},
-          DataType::FLOAT,
-      });
-
-      ParallelTensorShape weights_shape = throw_if_unexpected(
- 
get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); - ParallelTensorAttrs weight_attrs = - ParallelTensorAttrs{weights_shape, CreateGrad::YES}; - - ParallelTensorShape output_shape = throw_if_unexpected( - get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); - ParallelTensorAttrs output_attrs = - ParallelTensorAttrs{output_shape, CreateGrad::YES}; - - CostDetails result = cost_estimator.estimate_cost( - PCGOperatorAttrs{attrs}, - std::vector{ - inputs_shape, inputs_shape, inputs_shape}, - std::vector{weight_attrs}, - std::vector{output_attrs}, - make_1d_machine_view( - MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, - MachineSpecificationDimension::INTRA_NODE, - stride_t{1_p})); - - CHECK(result.total_elapsed_time > 0); - CHECK(result.total_mem_usage > 0); - } - } -} diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc deleted file mode 100644 index bba0bd28ce..0000000000 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ /dev/null @@ -1,146 +0,0 @@ -#include "kernels/local_cpu_allocator.h" -#include "local-execution/local_tensor_backing.h" -#include "test_utils.h" -#include "utils/containers/keys.h" -#include - -using namespace ::FlexFlow; - -bool is_shape_and_dtype_equal_for_tensor_backings( - std::unordered_map const &m1, - std::unordered_map const &m2) { - if (keys(m1) == keys(m2)) { - for (std::pair const - &tensor_type_backing : m1) { - if (is_shape_and_dtype_equal(tensor_type_backing.second, - m2.at(tensor_type_backing.first))) { - continue; - } else { - return false; - } - } - return true; - } else { - return false; - } -} - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalTensorBacking") { - MockTensorGuidSource tensor_guid_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - SUBCASE("merge_optimizer_mappings") { - SUBCASE("Both empty") { - std::unordered_map> - result = merge_optimizer_mappings({}, {}); - std::unordered_map> - correct = {}; - CHECK(result == correct); - } - - tensor_guid_t allocated_tensor_guid = - tensor_guid_source.new_mock_tensor_guid(); - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensor_source.new_optimizer_tensor(); - std::unordered_map> - correct = {{allocated_tensor_guid, - {optimizer_tensor_1, optimizer_tensor_2}}}; - SUBCASE("Unallocated is empty") { - std::unordered_map> - allocated = {{allocated_tensor_guid, - {optimizer_tensor_1, optimizer_tensor_2}}}; - std::unordered_map> - result = merge_optimizer_mappings(allocated, {}); - CHECK(result == correct); - } - SUBCASE("Allocated is empty") { - std::unordered_map> - unallocated = {{allocated_tensor_guid, - {optimizer_tensor_1, optimizer_tensor_2}}}; - std::unordered_map> - result = merge_optimizer_mappings({}, unallocated); - CHECK(result == correct); - } - - SUBCASE("Both are partially allocated") { - std::unordered_map> - allocated = {{allocated_tensor_guid, {optimizer_tensor_1}}}; - std::unordered_map> - unallocated = {{allocated_tensor_guid, {optimizer_tensor_2}}}; - std::unordered_map> - result = merge_optimizer_mappings(allocated, unallocated); - CHECK(result == correct); - } - } - - SUBCASE("get_tensor_backings") { - Allocator allocator = create_local_cpu_memory_allocator(); - SUBCASE("Both are empty") { - std::unordered_map result = - get_tensor_backings({}, {}, allocator); - 
std::unordered_map correct = - {}; - CHECK(result == correct); - } - - tensor_guid_t allocated_tensor_guid = - tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t unallocated_tensor_guid = - tensor_guid_source.new_mock_tensor_guid(); - - TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, - CreateGrad::YES}; - - GenericTensorAccessorW allocated_tensor_backing = - allocator.allocate_tensor(allocated_tensor_attrs.shape); - GenericTensorAccessorW unallocated_tensor_backing = - allocator.allocate_tensor(unallocated_tensor_attrs.shape); - - SUBCASE("Unallocated is empty") { - std::unordered_map - allocated = {{TensorTypeVariant{allocated_tensor_guid}, - allocated_tensor_backing}}; - std::unordered_map result = - get_tensor_backings(allocated, {}, allocator); - CHECK(result == allocated); - } - SUBCASE("Allocated is empty") { - std::unordered_map unallocated = { - {TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_attrs.shape}}; - std::unordered_map result = - get_tensor_backings({}, unallocated, allocator); - std::unordered_map correct = - {{TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_backing}}; - CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct)); - } - SUBCASE("Both are partially allocated") { - std::unordered_map - allocated = {{TensorTypeVariant{allocated_tensor_guid}, - allocated_tensor_backing}}; - std::unordered_map unallocated = { - {TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_attrs.shape}}; - - std::unordered_map result = - get_tensor_backings(allocated, unallocated, allocator); - std::unordered_map correct = - {{TensorTypeVariant{allocated_tensor_guid}, - allocated_tensor_backing}, - {TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_backing}}; - CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct)); - } - } - } -} diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc deleted file mode 100644 index 4bcfa7fe17..0000000000 --- a/lib/local-execution/test/src/test_task_registry.cc +++ /dev/null @@ -1,216 +0,0 @@ -#include "doctest/doctest.h" -#include "kernels/local_cuda_allocator.h" -#include "local-execution/local_cost_estimator.h" -#include "pcg/computation_graph_builder.h" -#include "task-spec/task_signature_impl.h" -#include "utils/fmt/optional.h" -#include "utils/fmt/unordered_map.h" - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("TaskRegistry") { - - layer_guid_t layer_guid = layer_guid_t{Node{0}}; - positive_int embed_dim = 32_p; - positive_int num_heads = 10_p; - ComputationGraphOpAttrs attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - - SUBCASE("register single layer") { - TaskRegistry task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - - TaskRegistry correct_task_registry = [&] { - std::unordered_map> - init_task_ids = {{layer_guid, task_id_t::ATTENTION_INIT_TASK_ID}}; - std::unordered_map> - fwd_task_ids = {{layer_guid, task_id_t::ATTENTION_FWD_TASK_ID}}; - std::unordered_map> - bwd_task_ids = {{layer_guid, 
task_id_t::ATTENTION_BWD_TASK_ID}}; - std::unordered_map task_mapping = { - {task_id_t::ATTENTION_INIT_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)}, - {task_id_t::ATTENTION_FWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)}, - {task_id_t::ATTENTION_BWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}}; - return TaskRegistry{ - init_task_ids, fwd_task_ids, bwd_task_ids, task_mapping}; - }(); - - CHECK(task_registry == correct_task_registry); - } - - SUBCASE("multiple layers same task") { - layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - {other_layer_guid, LayerAttrs{attrs, std::nullopt}}, - }); - - SUBCASE("layer to task ids") { - std::unordered_map> correct = { - {layer_guid, task_id_t::ATTENTION_INIT_TASK_ID}, - {other_layer_guid, task_id_t::ATTENTION_INIT_TASK_ID}, - }; - CHECK(correct == task_registry.init_task_ids); - } - - SUBCASE("task to signature+impl mapping") { - std::unordered_map - correct_task_mapping = { - {task_id_t::ATTENTION_INIT_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)}, - {task_id_t::ATTENTION_FWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)}, - {task_id_t::ATTENTION_BWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}}; - CHECK(correct_task_mapping == task_registry.task_mapping); - } - } - SUBCASE("different attrs, still same task fn mapping") { - layer_guid_t layer_1 = layer_guid_t{Node{1}}; - positive_int embed_dim = 100_p; - layer_guid_t layer_2 = layer_guid_t{Node{2}}; - ComputationGraphOpAttrs other_attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - {layer_1, LayerAttrs{attrs, std::nullopt}}, - {layer_2, LayerAttrs{other_attrs, std::nullopt}}, - }); - - std::unordered_map correct_task_mapping = - {{task_id_t::ATTENTION_INIT_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)}, - {task_id_t::ATTENTION_FWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)}, - {task_id_t::ATTENTION_BWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}}; - - CHECK(correct_task_mapping == task_registry.task_mapping); - } - - SUBCASE("equality") { - SUBCASE("different attrs is still equal") { - positive_int embed_dim = 100_p; - ComputationGraphOpAttrs other_attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - - TaskRegistry task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - TaskRegistry other_task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{other_attrs, std::nullopt}}}); - - CHECK(task_registry == other_task_registry); - } - - SUBCASE("different layer_guid is not equal") { - TaskRegistry task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; - TaskRegistry other_task_registry = construct_task_registry( - {{other_layer_guid, LayerAttrs{attrs, std::nullopt}}}); - - 
CHECK(task_registry != other_task_registry); - } - } - - SUBCASE("registry_contains_task_for_layer") { - SUBCASE("Task exists") { - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - }); - SUBCASE("Init") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::INIT); - CHECK(result == true); - } - SUBCASE("Fwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::FWD); - CHECK(result == true); - } - SUBCASE("Bwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::BWD); - CHECK(result == true); - } - } - - SUBCASE("Partial task does not exist") { - ComputationGraphOpAttrs bmm_attrs = ComputationGraphOpAttrs{ - BatchMatmulAttrs{/*a_seq_length_dim=*/10_n, - /*b_seq_length_dim=*/20_n}}; - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{bmm_attrs, std::nullopt}}, - }); - SUBCASE("Init") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::INIT); - CHECK(result == false); - } - SUBCASE("Fwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::FWD); - CHECK(result == true); - } - SUBCASE("Bwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::BWD); - CHECK(result == true); - } - } - - SUBCASE("Empty tasks") { - std::unordered_map> - empty_task_ids = {{layer_guid, std::nullopt}}; - TaskRegistry task_registry = - TaskRegistry{empty_task_ids, empty_task_ids, empty_task_ids, {}}; - SUBCASE("Init") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::INIT); - CHECK(result == false); - } - SUBCASE("Fwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::FWD); - CHECK(result == false); - } - SUBCASE("Bwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::BWD); - CHECK(result == false); - } - } - } - } -} diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc deleted file mode 100644 index 0a0b99e61c..0000000000 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ /dev/null @@ -1,440 +0,0 @@ -#include "kernels/local_cpu_allocator.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.h" -#include "pcg/computation_graph.dtg.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("UnallocatedTensors") { - MockTensorGuidSource tensor_guid_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - - Allocator allocator = create_local_cpu_memory_allocator(); - - tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = - tensor_guid_source.new_mock_tensor_guid(); - - gradient_tensor_t grad_tensor 
= - gradient_tensor_source.new_gradient_tensor(); - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensor_source.new_optimizer_tensor(); - - TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, - CreateGrad::YES}; - - GenericTensorAccessorW tensor_backing_1 = - allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = - allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = - allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); - - std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, - }; - - SUBCASE("Without optimizer") { - SUBCASE("AllocatedTensors is empty") { - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - gradient_tensor_source.reset(); - UnallocatedTensors result = generate_unallocated_tensors( - empty, tensor_attrs_mapping, gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains only 1 forward tensor") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - }, - {}, - {}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains only forward tensors") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - }, - {}, - {}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - 
CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains only gradient tensor") { - - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; - CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains mixture") { - - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; - CHECK(result == correct); - } - - SUBCASE("Fully AllocatedTensors") { - - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - UnallocatedTensors correct = UnallocatedTensors{{}, {}, {}}; - CHECK(result == correct); - } - } - - SUBCASE("With optimizer") { - SUBCASE("SGD Attrs") { - SUBCASE("without momentum") { - double momentum = 0.0; - OptimizerAttrs attrs = - OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - empty, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - gradient_tensor_source.reset(); - UnallocatedTensors correct = generate_unallocated_tensors( - empty, tensor_attrs_mapping, gradient_tensor_source); - CHECK(result == correct); - } - SUBCASE("with momentum") { - double momentum = 0.9; - OptimizerAttrs attrs = - OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; - - SUBCASE("unallocated") { - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - empty, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - 
{TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_1}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; - - CHECK(result == correct); - } - - SUBCASE("allocated") { - - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - allocated_optimizer_tensor, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - - CHECK(result == correct); - } - } - } - SUBCASE("Adam Attrs") { - OptimizerAttrs attrs = - OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, - /*beta1=*/0.9, - /*beta2=*/0.999, - /*weight_decay=*/0.001, - /*alpha_t=*/0.001, - /*beta_t=*/0.9, - /*beta2_t=*/0.999, - /*epsilon=*/1e-8}}; - SUBCASE("Empty") { - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - empty, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_1}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, - {optimizer_tensor_1, optimizer_tensor_2}}}}; - - CHECK(result == correct); - } - SUBCASE("Partially allocated") { - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - optimizer_tensor_t optimizer_tensor_pre_allocated = - optimizer_tensor_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_pre_allocated}, - tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_pre_allocated}}}}; - - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - allocated_optimizer_tensor, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - 
{TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_2}}}}; - - CHECK(result == correct); - } - - SUBCASE("Fully allocated") { - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, - {optimizer_tensor_1, optimizer_tensor_2}}}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - allocated_optimizer_tensor, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - - CHECK(result == correct); - } - } - } - } -} diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index bfcab8ffbf..e7b82d012f 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -1,5 +1,6 @@ #include "models/bert/bert.h" #include "op-attrs/initializers/truncated_normal_initializer_attrs.dtg.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" @@ -57,37 +58,37 @@ tensor_guid_t tensor_guid_t const &input, InitializerAttrs const &bias_initializer, InitializerAttrs const &projection_initializer) { - assert(num_dims(cgb.get_shape(input)) == 3); - std::vector layer_norm_axis = { + ASSERT(get_num_dims(cgb.get_shape(input).dims) == 3); + std::set layer_norm_axis = { relative_ff_dim_t{-1}}; // Apply layernorm across the last dim positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; tensor_guid_t self_attention = - cgb.multihead_attention(input, - input, - input, - config.hidden_size, - config.num_heads, - kdim, - vdim, + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + /*value=*/input, + /*embed_dim=*/config.hidden_size, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, /*dropout=*/config.attention_probs_dropout_prob, /*bias=*/true, /*add_bias_kv=*/false, /*add_zero_attn=*/false, /*initializer=*/projection_initializer); - assert(are_tensor_guid_shapes_equivalent( + ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); tensor_guid_t normalized = cgb.layer_norm(cgb.add(self_attention, input), layer_norm_axis, /*elementwise_affine=*/true, config.layer_norm_eps); - assert(are_tensor_guid_shapes_equivalent( + ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, normalized)); tensor_guid_t feedforward_output = create_feedforward_network( cgb, config, normalized, bias_initializer, projection_initializer); - assert(are_tensor_guid_shapes_equivalent( + 
ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, feedforward_output)); return cgb.layer_norm(cgb.add(normalized, feedforward_output), layer_norm_axis, @@ -138,7 +139,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { tensor_guid_t encoder_output = create_bert_encoder( cgb, config, input, bias_initializer, projection_initializer); - assert(are_tensor_guid_shapes_equivalent( + ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, encoder_output)); tensor_guid_t out_prob = @@ -149,7 +150,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { /*data_type=*/DataType::FLOAT, /*projection_initializer=*/projection_initializer, /*bias_initializer=*/bias_initializer)); - assert( + ASSERT( (cgb.get_shape(out_prob) == TensorShape{ TensorDims{FFOrdered{ diff --git a/lib/models/src/models/candle_uno/candle_uno.cc b/lib/models/src/models/candle_uno/candle_uno.cc index 8bbbccdbaf..13dd650c2c 100644 --- a/lib/models/src/models/candle_uno/candle_uno.cc +++ b/lib/models/src/models/candle_uno/candle_uno.cc @@ -85,7 +85,8 @@ ComputationGraph for (auto const &input_feature : config.input_features) { std::string const &feature_name = input_feature.second; positive_int shape = config.feature_shapes.at(feature_name); - tensor_guid_t input = create_input_tensor({config.batch_size, shape}); + tensor_guid_t input = + create_input_tensor(FFOrdered{config.batch_size, shape}); all_inputs.push_back(input); if (contains(input_models, feature_name)) { diff --git a/lib/models/src/models/dlrm/dlrm.cc b/lib/models/src/models/dlrm/dlrm.cc index d1dd52b4da..8e06a2dd6a 100644 --- a/lib/models/src/models/dlrm/dlrm.cc +++ b/lib/models/src/models/dlrm/dlrm.cc @@ -129,11 +129,12 @@ ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { std::vector sparse_inputs = repeat(num_elements(config.embedding_size), [&]() { return create_input_tensor( - {config.batch_size, config.embedding_bag_size}, DataType::INT64); + FFOrdered{config.batch_size, config.embedding_bag_size}, + DataType::INT64); }); tensor_guid_t dense_input = create_input_tensor( - {config.batch_size, config.dense_arch_layer_sizes.front()}, + FFOrdered{config.batch_size, config.dense_arch_layer_sizes.front()}, DataType::FLOAT); // Construct the model diff --git a/lib/models/src/models/transformer/transformer.cc b/lib/models/src/models/transformer/transformer.cc index dfc40a5720..5298c7682b 100644 --- a/lib/models/src/models/transformer/transformer.cc +++ b/lib/models/src/models/transformer/transformer.cc @@ -32,7 +32,7 @@ tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb, TransformerConfig const &config, tensor_guid_t const &input) { - std::vector layer_norm_axis = { + std::set layer_norm_axis = { relative_ff_dim_t{-1}}; // Normalize the last dim positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; @@ -81,7 +81,7 @@ tensor_guid_t TransformerConfig const &config, tensor_guid_t const &input, tensor_guid_t const &encoder_output) { - std::vector layer_norm_axis = { + std::set layer_norm_axis = { relative_ff_dim_t{-1}}; // Normalize the last dim positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; diff --git a/lib/op-attrs/include/op-attrs/datatype.h 
b/lib/op-attrs/include/op-attrs/datatype.h index ad45dcb13c..eab346f41f 100644 --- a/lib/op-attrs/include/op-attrs/datatype.h +++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -3,7 +3,7 @@ #include "op-attrs/datatype.dtg.h" #include "utils/fmt.h" -#include "utils/fp16.h" +#include "utils/half.h" #include "utils/positive_int/positive_int.h" #include
diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h index b646692de9..fcd1245b54 100644 --- a/lib/op-attrs/include/op-attrs/datatype_value.h +++ b/lib/op-attrs/include/op-attrs/datatype_value.h
@@ -3,15 +3,19 @@ #include "op-attrs/datatype.dtg.h" #include "op-attrs/datatype_value.dtg.h" +#include "utils/half.h" namespace FlexFlow { +DataTypeValue make_half_data_type_value(half value); DataTypeValue make_float_data_type_value(float value); DataTypeValue make_double_data_type_value(double value); DataTypeValue make_int32_data_type_value(int32_t value); DataTypeValue make_int64_data_type_value(int64_t value); DataTypeValue make_bool_data_type_value(bool value); +DataTypeValue make_zero_data_type_value_of_type(DataType); + DataType get_data_type_of_data_type_value(DataTypeValue); } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/datatype_value.variant.toml b/lib/op-attrs/include/op-attrs/datatype_value.variant.toml index 3386e9d131..4c867917b0 100644 --- a/lib/op-attrs/include/op-attrs/datatype_value.variant.toml +++ b/lib/op-attrs/include/op-attrs/datatype_value.variant.toml
@@ -9,6 +9,19 @@ features = [ "fmt", ] +includes = [ + "utils/half.h", +] + +src_includes = [ + "utils/json/half.h", + "utils/rapidcheck/half.h", + "utils/fmt/half.h", +] + +[[values]] +type = "half" + [[values]] type = "float"
diff --git a/lib/op-attrs/include/op-attrs/ff_dim_t.h b/lib/op-attrs/include/op-attrs/ff_dim_t.h index 5fab792b13..0979201f67 100644 --- a/lib/op-attrs/include/op-attrs/ff_dim_t.h +++ b/lib/op-attrs/include/op-attrs/ff_dim_t.h
@@ -6,7 +6,11 @@ #include "rapidcheck.h" namespace FlexFlow { + relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim); + +ff_dim_t add_to_ff_dim(ff_dim_t ff_dim, int value); + } // namespace FlexFlow namespace rc {
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h index 92ed211c31..fe2e8d9dc6 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h
@@ -12,16 +12,14 @@ template <typename T> struct FFOrdered { FFOrdered() {} - FFOrdered(std::initializer_list<T> const &l) : contents(l.begin(), l.end()) {} - - FFOrdered(std::vector<T> const &contents) - : contents(contents.begin(), contents.end()) {} + explicit FFOrdered(std::initializer_list<T> const &l) + : contents(l.begin(), l.end()) {} template <typename It> - FFOrdered(It begin, It end) : contents(begin, end) {} + explicit FFOrdered(It begin, It end) : contents(begin, end) {} template <size_t MAXSIZE> - FFOrdered(stack_vector<T, MAXSIZE> const &contents) + explicit FFOrdered(stack_vector<T, MAXSIZE> const &contents) : contents(contents.begin(), contents.end()) {} T const &at(ff_dim_t idx) const {
@@ -190,7 +188,8 @@ namespace nlohmann { template <typename T> struct adl_serializer<::FlexFlow::FFOrdered<T>> { static ::FlexFlow::FFOrdered<T> from_json(nlohmann::json const &j) { - return {j.template get<std::vector<T>>()}; + std::vector<T> v = j.template get<std::vector<T>>(); + return ::FlexFlow::FFOrdered<T>(v.cbegin(), v.cend()); } static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered<T> const &x) {
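// Editorial note (not part of the patch): the constructor changes above make
// FFOrdered's converting constructors explicit, which is what forces the
// direct-initialization churn in the model and test diffs later in this patch.
// A minimal illustration, assuming FFOrdered as modified above:
//
//   FFOrdered<int> a = FFOrdered<int>{1, 2};  // ok: direct-initialization
//   FFOrdered<int> b{1, 2};                   // ok: direct-list-initialization
//   FFOrdered<int> c = {1, 2};                // ill-formed: copy-list-initialization
//                                             // cannot call an explicit constructor
//   FFOrdered<int> d = {};                    // still ok: empty braces select the
//                                             // (non-explicit) default constructor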
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h b/lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h new file mode 100644 index 0000000000..d41e68342a --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h
@@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_FILTRANS_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_FILTRANS_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "utils/containers/filtrans.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template <typename In, typename F, typename Out = typename std::invoke_result_t<F, In>::value_type> +FFOrdered<Out> filtrans(FFOrdered<In> const &v, F &&f) { + return ff_ordered_of(filtrans(vector_of(v), f)); +} + +} // namespace FlexFlow + +#endif
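// Editorial sketch (not part of the patch): filtrans = filter + transform.
// The callback returns std::optional<Out>; nullopt entries are dropped and the
// remaining values are unwrapped, e.g. (hypothetical values):
//
//   FFOrdered<int> xs = FFOrdered<int>{1, 2, 3, 4};
//   FFOrdered<int> evens_doubled =
//       filtrans(xs, [](int x) -> std::optional<int> {
//         return (x % 2 == 0) ? std::optional<int>{2 * x} : std::nullopt;
//       });
//   // evens_doubled == FFOrdered<int>{4, 8}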
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/reversed.h b/lib/op-attrs/include/op-attrs/ff_ordered/reversed.h new file mode 100644 index 0000000000..0986bf560d --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/reversed.h
@@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_REVERSED_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_REVERSED_H + +#include "op-attrs/ff_ordered/ff_ordered.h" + +namespace FlexFlow { + +template <typename T> +FFOrdered<T> reversed(FFOrdered<T> const &t) { + FFOrdered<T> result(std::crbegin(t), std::crend(t)); + return result; +} + +} // namespace FlexFlow + +#endif
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h index 79217c4cc3..c8ca49d4cf 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h
@@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H #include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" #include "utils/containers/slice.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h"
@@ -15,7 +16,7 @@ FFOrdered<T> ff_dim_t_nonoverloaded_slice(FFOrdered<T> const &d, int raw_start = start.value.unwrap_nonnegative(); std::optional<int> raw_end = transform( end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); - return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)}; + return ff_ordered_of(slice(vector_of(d), raw_start, raw_end)); } template <typename T>
@@ -27,7 +28,7 @@ FFOrdered<T> relative_ff_dim_t_nonoverloaded_slice( std::optional<int> raw_end = transform(end, [](relative_ff_dim_t const &i) { return i.value; }); - return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)}; + return ff_ordered_of(slice(vector_of(d), raw_start, raw_end)); } template <typename T>
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h index 3a8eeb9ecf..c7ee3c2c54 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h
@@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H #include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" #include "utils/containers/vector_of.h" #include "utils/containers/vector_transform.h"
@@ -9,7 +10,7 @@ namespace FlexFlow { template <typename T, typename F, typename Out = std::invoke_result_t<F, T>> FFOrdered<Out> transform(FFOrdered<T> const &d, F &&f) { - return FFOrdered<Out>{vector_transform(vector_of(d), f)}; + return ff_ordered_of(vector_transform(vector_of(d), f)); } } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h index fe207740f7..42ca3d69a3 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h
@@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H #include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" #include "utils/containers/vector_of.h" #include "utils/containers/zip.h"
@@ -10,7 +11,7 @@ namespace FlexFlow { template <typename T1, typename T2> FFOrdered<std::pair<T1, T2>> zip(FFOrdered<T1> const &lhs, FFOrdered<T2> const &rhs) { - return FFOrdered<std::pair<T1, T2>>{zip(vector_of(lhs), vector_of(rhs))}; + return ff_ordered_of(zip(vector_of(lhs), vector_of(rhs))); } } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc new file mode 100644 index 0000000000..63be94ab9c --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc
@@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/zip_with.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; +using Result = value_type<2>; +using F = std::function<Result(T1 const &, T2 const &)>; + +template FFOrdered<Result> + zip_with(FFOrdered<T1> const &, FFOrdered<T2> const &, F &&); + +} // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h new file mode 100644 index 0000000000..25ae7e5a55 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h
@@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_WITH_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_WITH_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/zip_with.h" + +namespace FlexFlow { + +template <typename T1, typename T2, typename F, typename Result = std::invoke_result_t<F, T1, T2>> +FFOrdered<Result> + zip_with(FFOrdered<T1> const &lhs, FFOrdered<T2> const &rhs, F &&f) { + return ff_ordered_of(zip_with(vector_of(lhs), vector_of(rhs), f)); +} + +} // namespace FlexFlow + +#endif
diff --git a/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml index 66d475aa46..f76c7c683f 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml
@@ -10,8 +10,11 @@ features = [ ] includes = [ + "op-attrs/ff_dim_t.dtg.h", +] + +src_includes = [ "op-attrs/ff_dim_t.h", - "op-attrs/ff_dim_t.dtg.h" ] [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml index d2a539e140..12e29d8a60 100644 --- a/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml
@@ -10,14 +10,19 @@ features = [ ] includes = [ - "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "utils/stack_vector/stack_vector.h", + "<set>", +] + +src_includes = [ + "utils/fmt/set.h", + "utils/hash/set.h", + "op-attrs/ff_dim_t.h", ] [[fields]] name = "axes" -type = "::FlexFlow::stack_vector<::FlexFlow::ff_dim_t, MAX_TENSOR_DIM>" +type = "std::set<::FlexFlow::ff_dim_t>" [[fields]] name = "elementwise_affine"
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index a21602e28c..0f5b987944 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h
@@ -3,21 +3,55 @@ #include "op-attrs/parallel_tensor_dims.dtg.h" #include "op-attrs/tensor_dims.dtg.h" +#include "op-attrs/tensor_dims_coord.dtg.h" +#include "utils/bidict/bidict.h"
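// Editorial sketch (not part of the patch): intended behavior of the dim-index
// helpers declared below, assuming dims = TensorDims{FFOrdered<positive_int>{4_p, 7_p}}:
//
//   get_num_dims(dims)                        -> 2_n
//   dim_at_idx(dims, ff_dim_t{1_n})           -> 7_p
//   dim_at_idx(dims, relative_ff_dim_t{-1})   -> 7_p   (negative indices count from the end)
//   try_dim_at_idx(dims, ff_dim_t{5_n})       -> std::nullopt instead of failing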
namespace FlexFlow { FFOrdered<positive_int> const &ff_ordered(TensorDims const &); -nonnegative_int num_dims(TensorDims const &); +bool tensor_dims_has_dim(TensorDims const &, ff_dim_t); + +nonnegative_int get_num_dims(TensorDims const &); + positive_int dim_at_idx(TensorDims const &, relative_ff_dim_t); positive_int &dim_at_idx(TensorDims &, relative_ff_dim_t); + +positive_int dim_at_idx(TensorDims const &, ff_dim_t); +positive_int &dim_at_idx(TensorDims &, ff_dim_t); + +std::optional<positive_int> try_dim_at_idx(TensorDims const &, + relative_ff_dim_t); +std::optional<positive_int> try_dim_at_idx(TensorDims const &, ff_dim_t); + positive_int get_num_elements(TensorDims const &); bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal); + +bool tensor_dims_contains_coord(TensorDims const &tensor_dims, + TensorDimsCoord const &coord); + +TensorDimsCoord get_broadcast_src_coord(TensorDims const &input_dims, + TensorDims const &output_dims, + TensorDimsCoord const &dst_coord); + +std::unordered_set<TensorDimsCoord> + get_tensor_dims_coord_set(TensorDims const &tensor_dims); + +std::unordered_set<ff_dim_t> get_ff_dim_t_set(TensorDims const &); + std::optional<TensorDims> get_broadcast_target_dims(std::unordered_set<TensorDims> const &); + +TensorDims + tensor_dims_drop_dims(TensorDims const &dims, + std::function<bool(ff_dim_t)> const &should_drop_dim); + +TensorDims slice_tensor_dims(TensorDims const &, + ff_dim_t const &start, + std::optional<ff_dim_t> const &stop); + TensorDims slice_tensor_dims(TensorDims const &, relative_ff_dim_t const &start, std::optional<relative_ff_dim_t> const &stop);
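// Editorial sketch (not part of the patch): the broadcast-coordinate mapping
// declared above, worked on a small example. Trailing dims are aligned, and a
// source dim of size 1 always maps to coordinate 0:
//
//   TensorDims input = TensorDims{FFOrdered<positive_int>{1_p, 3_p}};
//   TensorDims output = TensorDims{FFOrdered<positive_int>{2_p, 3_p}};
//   TensorDimsCoord dst = TensorDimsCoord{FFOrdered<nonnegative_int>{1_n, 2_n}};
//   get_broadcast_src_coord(input, output, dst)
//       -> TensorDimsCoord{FFOrdered<nonnegative_int>{0_n, 2_n}}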
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims_coord.h b/lib/op-attrs/include/op-attrs/tensor_dims_coord.h new file mode 100644 index 0000000000..44448c5f96 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/tensor_dims_coord.h
@@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_TENSOR_DIMS_COORD_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_TENSOR_DIMS_COORD_H + +#include "op-attrs/tensor_dims_coord.dtg.h" + +namespace FlexFlow { + +nonnegative_int + tensor_dims_coord_get_num_dims(TensorDimsCoord const &tensor_dims_coord); + +TensorDimsCoord tensor_dims_coord_drop_dims( + TensorDimsCoord const &coord, + std::function<bool(ff_dim_t)> const &should_drop_dim); + +} // namespace FlexFlow + +#endif
diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims_coord.struct.toml similarity index 74% rename from lib/kernels/include/kernels/array_coord.struct.toml rename to lib/op-attrs/include/op-attrs/tensor_dims_coord.struct.toml index 8ce121f2bf..53f4405389 100644 --- a/lib/kernels/include/kernels/array_coord.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims_coord.struct.toml
@@ -1,17 +1,16 @@ namespace = "FlexFlow" -name = "ArrayCoord" +name = "TensorDimsCoord" features = [ "eq", "ord", "hash", - "fmt", - "rapidcheck", "json", + "fmt", ] includes = [ "op-attrs/ff_ordered/ff_ordered.h", - "utils/nonnegative_int/nonnegative_int.h" + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index 3cafdda4b8..403b853fab 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h
@@ -2,14 +2,15 @@ #define _FLEXFLOW_OPATTRS_TENSOR_SHAPE_H #include "op-attrs/tensor_shape.dtg.h" +#include "utils/units/num_bytes_t.h" namespace FlexFlow { -nonnegative_int num_dims(TensorShape const &); -positive_int dim_at_idx(TensorShape const &, relative_ff_dim_t); -positive_int &dim_at_idx(TensorShape &, relative_ff_dim_t); -positive_int get_num_elements(TensorShape const &); -positive_int get_size_in_bytes(TensorShape const &); +num_bytes_t get_size_in_bytes(TensorShape const &); + +TensorShape tensor_shape_drop_dims( + TensorShape const &coord, + std::function<bool(ff_dim_t)> const &should_drop_dim); TensorShape slice_tensor_shape(TensorShape const &, relative_ff_dim_t const &start,
diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc index a4abde2cb4..620b342cd8 100644 --- a/lib/op-attrs/src/op-attrs/datatype_value.cc +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -1,8 +1,13 @@ #include "op-attrs/datatype_value.h" #include "utils/overload.h" +#include <optional> namespace FlexFlow { +DataTypeValue make_half_data_type_value(half value) { + return DataTypeValue{value}; +} + DataTypeValue make_float_data_type_value(float value) { return DataTypeValue{value}; }
@@ -25,6 +30,7 @@ DataTypeValue make_bool_data_type_value(bool value) { DataType get_data_type_of_data_type_value(DataTypeValue value) { return value.visit(overload{ + [](half) { return DataType::HALF; }, [](float) { return DataType::FLOAT; }, [](double) { return DataType::DOUBLE; }, [](int32_t) { return DataType::INT32; },
@@ -33,4 +39,36 @@ DataType get_data_type_of_data_type_value(DataTypeValue value) { }); } +DataTypeValue make_zero_data_type_value_of_type(DataType data_type) { + std::optional<DataTypeValue> result = std::nullopt; + + switch (data_type) { + case DataType::HALF: + result = make_half_data_type_value(0.0); + break; + case DataType::FLOAT: + result = make_float_data_type_value(0.0); + break; + case DataType::DOUBLE: + result = make_double_data_type_value(0.0); + break; + case DataType::INT32: + result = make_int32_data_type_value(0); + break; + case DataType::INT64: + result = make_int64_data_type_value(0); + break; + case DataType::BOOL: + result = make_bool_data_type_value(false); + break; + default: + PANIC("Unhandled DataType value", data_type); + }; + + ASSERT(result.has_value()); + ASSERT(get_data_type_of_data_type_value(result.value()) == data_type); + + return result.value(); +} + } // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/ff_dim_t.cc b/lib/op-attrs/src/op-attrs/ff_dim_t.cc index 44672fc391..63c783d909 100644 --- a/lib/op-attrs/src/op-attrs/ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/ff_dim_t.cc
@@ -1,9 +1,16 @@ #include "op-attrs/ff_dim_t.h" namespace FlexFlow { + relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim) { + return relative_ff_dim_t{ff_dim.value.unwrap_nonnegative()}; } + +ff_dim_t add_to_ff_dim(ff_dim_t ff_dim, int value) { + return ff_dim_t{nonnegative_int{ff_dim.value.unwrap_nonnegative() + value}}; +} + } // namespace FlexFlow namespace rc {
diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc b/lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc new file mode 100644 index 0000000000..ff5e4c4af7 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc
@@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/filtrans.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using In = value_type<0>; +using Out = value_type<1>; +using F = std::function<std::optional<Out>(In const &)>; + +template FFOrdered<Out> filtrans(FFOrdered<In> const &, F &&); + +} // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc b/lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc new file mode 100644 index 0000000000..5e8f2eb6e3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc @@ -0,0 +1,10 @@ +#include
"op-attrs/ff_ordered/reversed.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered reversed(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 5800f086ef..cc6ef8cfac 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -2,6 +2,7 @@ #include "op-attrs/ops/attention/multihead_attention_inputs.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/extend.h" #include "utils/expected.h" @@ -34,15 +35,15 @@ positive_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { } positive_int get_qSize(TensorShape const &query_shape) { - return dim_at_idx(query_shape, relative_ff_dim_t{0}); + return dim_at_idx(query_shape.dims, relative_ff_dim_t{0}); } positive_int get_kSize(TensorShape const &key_shape) { - return dim_at_idx(key_shape, relative_ff_dim_t{0}); + return dim_at_idx(key_shape.dims, relative_ff_dim_t{0}); } positive_int get_vSize(TensorShape const &value_shape) { - return dim_at_idx(value_shape, relative_ff_dim_t{0}); + return dim_at_idx(value_shape.dims, relative_ff_dim_t{0}); } positive_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index 7bf3b9d91e..102e54cbe3 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -1,4 +1,5 @@ #include "op-attrs/ops/attention/multihead_attention_inputs.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" namespace FlexFlow { @@ -12,28 +13,28 @@ tl::expected parse_attention_input_shape(TensorShape const &input_q, TensorShape const &input_k, TensorShape const &input_v) { - if (num_dims(input_q) != 3) { + if (get_num_dims(input_q.dims) != 3) { return tl::unexpected( fmt::format("Query input has incorrect number of dims: {} != {}", - num_dims(input_q), + get_num_dims(input_q.dims), 3)); } - if (num_dims(input_k) != 3) { + if (get_num_dims(input_k.dims) != 3) { return tl::unexpected( fmt::format("Key input has incorrect number of dims: {} != {}", - num_dims(input_k), + get_num_dims(input_k.dims), 3)); } - if (num_dims(input_v) != 3) { + if (get_num_dims(input_v.dims) != 3) { return tl::unexpected( fmt::format("Value input has incorrect number of dims: {} != {}", - num_dims(input_v), + get_num_dims(input_v.dims), 3)); } - positive_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); - positive_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); - positive_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); + positive_int seq_len_q = dim_at_idx(input_q.dims, relative_ff_dim_t{-2}); + positive_int seq_len_k = dim_at_idx(input_k.dims, relative_ff_dim_t{-2}); + positive_int seq_len_v = dim_at_idx(input_v.dims, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +44,9 @@ tl::expected seq_len_v)); } - positive_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); - positive_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); - positive_int batch_size_v = dim_at_idx(input_v, 
relative_ff_dim_t{-3}); + positive_int batch_size_q = dim_at_idx(input_q.dims, relative_ff_dim_t{-3}); + positive_int batch_size_k = dim_at_idx(input_k.dims, relative_ff_dim_t{-3}); + positive_int batch_size_v = dim_at_idx(input_v.dims, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +64,9 @@ tl::expected input_v.data_type)); } - positive_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); - positive_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); - positive_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); + positive_int q_size = dim_at_idx(input_q.dims, relative_ff_dim_t{-1}); + positive_int k_size = dim_at_idx(input_k.dims, relative_ff_dim_t{-1}); + positive_int v_size = dim_at_idx(input_v.dims, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index 33c4987233..3c76561d17 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -1,5 +1,6 @@ #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" namespace FlexFlow { @@ -39,16 +40,16 @@ tl::expected // out will be a (b×n×p) tensor. // https://pytorch.org/docs/stable/generated/torch.bmm.html - if (num_dims(input_lhs) != 3) { + if (get_num_dims(input_lhs.dims) != 3) { return tl::unexpected( fmt::format("LHS input has incorrect number of shard dims: {} != {}", - num_dims(input_lhs), + get_num_dims(input_lhs.dims), 3)); } - if (num_dims(input_rhs) != 3) { + if (get_num_dims(input_rhs.dims) != 3) { return tl::unexpected( fmt::format("RHS input has incorrect number of shard dims: {} != {}", - num_dims(input_rhs), + get_num_dims(input_rhs.dims), 3)); } if (input_lhs.data_type != input_rhs.data_type) { @@ -57,13 +58,13 @@ tl::expected input_rhs.data_type)); } - positive_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); - positive_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); - positive_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); + positive_int lhs_b = dim_at_idx(input_lhs.dims, relative_ff_dim_t{0}); + positive_int n = dim_at_idx(input_lhs.dims, relative_ff_dim_t{1}); + positive_int lhs_m = dim_at_idx(input_lhs.dims, relative_ff_dim_t{2}); - positive_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); - positive_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); - positive_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); + positive_int rhs_b = dim_at_idx(input_rhs.dims, relative_ff_dim_t{0}); + positive_int rhs_m = dim_at_idx(input_rhs.dims, relative_ff_dim_t{1}); + positive_int p = dim_at_idx(input_rhs.dims, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index f42467320b..cfe5bafaba 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -2,6 +2,7 @@ #include "op-attrs/ff_ordered/concat.h" #include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" #include "utils/containers/extend.h" @@ -23,7 +24,7 @@ std::vector static std::optional check_input_shape(BatchNormAttrs const &, TensorShape const &input_shape) { - if (num_dims(input_shape) < 2) { + if 
(get_num_dims(input_shape.dims) < 2) { return fmt::format( "BatchNormAttrs expected input dims >= 2, but received input shape {}", input_shape); @@ -68,7 +69,8 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + positive_int num_channels = + dim_at_idx(input_shape.dims, relative_ff_dim_t{1}); return TensorShape{ TensorDims{FFOrdered{ diff --git a/lib/op-attrs/src/op-attrs/ops/broadcast.cc b/lib/op-attrs/src/op-attrs/ops/broadcast.cc index 31e241e27b..d84a9ee46e 100644 --- a/lib/op-attrs/src/op-attrs/ops/broadcast.cc +++ b/lib/op-attrs/src/op-attrs/ops/broadcast.cc @@ -13,7 +13,7 @@ RecordFormatter as_dot(BroadcastAttrs const &attrs) { return rr; }; - for (int i = 0; i < num_dims(attrs.target_dims); i++) { + for (int i = 0; i < get_num_dims(attrs.target_dims); i++) { r << kv(fmt::format("target_dims[{}]", i), dim_at_idx(attrs.target_dims, relative_ff_dim_t{i})); } @@ -24,7 +24,7 @@ RecordFormatter as_dot(BroadcastAttrs const &attrs) { tl::expected get_output_shape(BroadcastAttrs const &attrs, TensorShape const &input_shape) { - if (num_dims(attrs.target_dims) < num_dims(input_shape.dims)) { + if (get_num_dims(attrs.target_dims) < get_num_dims(input_shape.dims)) { return tl::unexpected(fmt::format( "get_output_shape for Broadcast expected num_dims(input_dims) <= " "num_dims(target_dims), but recieved input_shape {} with num dims " diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index aed118dd62..8f2752b71f 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -33,7 +33,7 @@ tl::expected } if (!are_all_same(transform( - inputs, [](TensorShape const &s) { return num_dims(s); }))) { + inputs, [](TensorShape const &s) { return get_num_dims(s.dims); }))) { return tl::unexpected( fmt::format("get_output_shape for Concat expected all inputs to have " "the same number of dimensions, but receieved {}", @@ -51,7 +51,7 @@ tl::expected std::vector axis_dim_sizes = transform(inputs, [&](TensorShape const &s) { - return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); + return dim_at_idx(s.dims, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); }); positive_int output_axis_dim_size = sum(axis_dim_sizes); diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index 2ac90c1c9c..6ff1b8a06e 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -115,7 +115,7 @@ ParallelTensorShape get_kernel_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.sample_dim.degree * parsed.sum_reduction_degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = FFOrdered{ parsed.discard_copy_reduction_degree, parsed.channel_dim.degree, 1_p, @@ -139,7 +139,7 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.height_dim.degree * parsed.width_dim.degree * parsed.sample_dim.degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = FFOrdered{ parsed.discard_copy_reduction_degree, }; @@ -161,7 +161,7 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{parsed.sum_reduction_degree * parsed.channel_dim.degree}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; - FFOrdered shard_degrees = { + 
FFOrdered shard_degrees = FFOrdered{ parsed.sample_dim.degree, parsed.discard_copy_reduction_degree, 1_p, diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index 75db5c56fb..79bb14f2b2 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -1,22 +1,23 @@ #include "op-attrs/ops/conv_2d/conv_2d_input_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { - assert(num_dims(input) == 4); + ASSERT(get_num_dims(input.dims) == 4); - positive_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); - positive_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); - positive_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); - positive_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input.dims, relative_ff_dim_t{0}); + positive_int in_channels = dim_at_idx(input.dims, relative_ff_dim_t{1}); + positive_int in_height = dim_at_idx(input.dims, relative_ff_dim_t{2}); + positive_int in_width = dim_at_idx(input.dims, relative_ff_dim_t{3}); return Conv2DInputShape{ - num_samples, - in_channels, - in_height, - in_width, - input.data_type, + /*num_samples=*/num_samples, + /*num_channels=*/in_channels, + /*height=*/in_height, + /*width=*/in_width, + /*datatype=*/input.data_type, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 809b4cdaf9..e0e1a44b3b 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -3,6 +3,7 @@ #include "op-attrs/ff_ordered/transform.h" #include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" +#include "op-attrs/tensor_dims.h" #include "utils/containers/product.h" #include "utils/fmt/optional.h" #include "utils/integer_conversions.h" @@ -52,7 +53,7 @@ tl::expected } TensorShape output = input; - dim_at_idx(output, relative_ff_dim_t{-1}) = attrs.out_channels; + dim_at_idx(output.dims, relative_ff_dim_t{-1}) = attrs.out_channels; output.data_type = attrs.data_type; return output; } @@ -120,7 +121,7 @@ tl::expected [](ShardParallelDim const &d) -> positive_int { return d.degree; }))}; positive_int entry_dim_degree = 1_p; positive_int out_channel_degree = get_discard_copy_degree(input); - FFOrdered shard_degrees = { + FFOrdered shard_degrees = FFOrdered{ entry_dim_degree, out_channel_degree, }; diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index a2183a71b4..14180cecf8 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -26,7 +26,7 @@ TensorShape get_output_shape(FlatAttrs const &attrs, TensorDims{ concat(std::vector{ leading_dims, - {product(flattened_dims)}, + FFOrdered{product(flattened_dims)}, trailing_dims, }), }, @@ -59,7 +59,7 @@ tl::expected /*shard_degrees=*/ concat(std::vector{ slice(input_degrees.shard_degrees, ff_dim_t{0_n}, attrs.start_dim), - {product(flattened_dim_degrees)}, + FFOrdered{product(flattened_dim_degrees)}, slice(input_degrees.shard_degrees, attrs.end_dim, std::nullopt), }), }; diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 3637aacc5c..e0db1cdfe7 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ 
b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -2,6 +2,7 @@ #include "op-attrs/ff_ordered/ff_ordered_of.h" #include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" #include "utils/containers/any_of.h" @@ -9,6 +10,7 @@ #include "utils/containers/extend.h" #include "utils/containers/filter.h" #include "utils/expected.h" +#include "utils/fmt/set.h" namespace FlexFlow { @@ -28,7 +30,7 @@ static std::optional check_input_shape(LayerNormAttrs const &attrs, TensorShape const &input_shape) { if (any_of(attrs.axes, [&](ff_dim_t axis) { - return axis.value >= num_dims(input_shape); + return axis.value >= get_num_dims(input_shape.dims); })) { return fmt::format( "LayerNorm axes {} out-of-bounds for input tensor shape {}", @@ -74,7 +76,7 @@ tl::expected [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { - return dim_at_idx(input_shape, + return dim_at_idx(input_shape.dims, relative_ff_dim_t_from_ff_dim_t(dim_idx)); }); diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index 32791e81a9..37f504f873 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -3,6 +3,7 @@ #include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" @@ -44,11 +45,12 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - positive_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); + positive_int in_channels = + dim_at_idx(input_shape.dims, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ - FFOrdered{in_channels, attrs.out_channels}, + FFOrdered{attrs.out_channels, in_channels}, }, input_shape.data_type, }; @@ -105,8 +107,8 @@ tl::expected relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ - shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, get_discard_copy_degree(input), + shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, }; return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index 361216cce4..ee75340ed0 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -15,7 +15,7 @@ tl::expected // AdaptivePool2D semantics pulled from // https://stackoverflow.com/questions/53841509/how-does-adaptive-pooling-in-pytorch-work/63603993 - if (num_dims(input_dims) != 4) { + if (get_num_dims(input_dims) != 4) { return tl::unexpected( fmt::format("make_adaptive_pool2d_attrs expected input tensor to " "have 4 dims, but received dims {}", @@ -119,17 +119,19 @@ static positive_int calculate_output_size(positive_int input_size, tl::expected get_output_shape(Pool2DAttrs const &attrs, TensorShape const &input_shape) { - if (num_dims(input_shape) != 4) { + if (get_num_dims(input_shape.dims) != 4) { return tl::unexpected( fmt::format("get_output_shape for Pool2DAttrs expected input tensor to " "have 4 dims, but received shape {}", input_shape)); } - positive_int num_samples = dim_at_idx(input_shape, 
relative_ff_dim_t{0}); - positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); - positive_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); - positive_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input_shape.dims, relative_ff_dim_t{0}); + positive_int num_channels = + dim_at_idx(input_shape.dims, relative_ff_dim_t{1}); + positive_int input_height = + dim_at_idx(input_shape.dims, relative_ff_dim_t{2}); + positive_int input_width = dim_at_idx(input_shape.dims, relative_ff_dim_t{3}); positive_int output_height = calculate_output_size(/*input_size=*/input_height, diff --git a/lib/op-attrs/src/op-attrs/ops/softmax.cc b/lib/op-attrs/src/op-attrs/ops/softmax.cc index 0d55a2ec2c..2c03fe1689 100644 --- a/lib/op-attrs/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/src/op-attrs/ops/softmax.cc @@ -1,5 +1,6 @@ #include "op-attrs/ops/softmax.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" namespace FlexFlow { @@ -7,7 +8,7 @@ namespace FlexFlow { tl::expected get_output_shape(SoftmaxAttrs const &attrs, TensorShape const &input_shape) { - if (attrs.dim.value >= num_dims(input_shape)) { + if (attrs.dim.value >= get_num_dims(input_shape.dims)) { return tl::unexpected( fmt::format("get_output_shape for Softmax received out-of-bounds " "attrs.dim {} for input tensor shape {}", diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index dd5230f5a4..1c77bc6ca8 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -43,9 +43,11 @@ ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { ParallelTensorDims lift_to_parallel(TensorDims const &dims) { std::vector shard_degrees = - repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_p); - return lift_to_parallel_with_degrees( - dims, SumDegree{1_p}, DiscardCopyDegree{1_p}, shard_degrees); + repeat_element(/*num_times=*/get_num_dims(dims), /*element=*/1_p); + return lift_to_parallel_with_degrees(dims, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + ff_ordered_of(shard_degrees)); } ParallelTensorDims lift_to_parallel_with_degrees( @@ -61,7 +63,7 @@ ParallelTensorDims lift_to_parallel_with_degrees( return ShardParallelDim{size, degree}; }); - return ParallelTensorDims{FFOrdered{lifted}, + return ParallelTensorDims{ff_ordered_of(lifted), ReplicaParallelDimSet{ sum_degree, discard_copy_degree, diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index b48a23b281..435f211a01 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -1,15 +1,24 @@ #include "op-attrs/tensor_dims.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/filtrans.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/ff_ordered/slice.h" #include "op-attrs/ff_ordered/zip.h" +#include "op-attrs/ff_ordered/zip_with.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.dtg.h" +#include "utils/containers/all_are_true.h" #include "utils/containers/all_of.h" +#include "utils/containers/cartesian_product.h" +#include "utils/containers/contains.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" #include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include 
"utils/containers/vector_of.h" #include "utils/containers/zip.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -18,7 +27,11 @@ FFOrdered const &ff_ordered(TensorDims const &dims) { return dims.ff_ordered; } -nonnegative_int num_dims(TensorDims const &dims) { +bool tensor_dims_has_dim(TensorDims const &tensor_dims, ff_dim_t dim) { + return contains(get_idxs(tensor_dims.ff_ordered), dim); +} + +nonnegative_int get_num_dims(TensorDims const &dims) { return num_elements(dims.ff_ordered); } @@ -30,13 +43,39 @@ positive_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } +positive_int dim_at_idx(TensorDims const &dims, ff_dim_t ff_dim_idx) { + return dims.ff_ordered.at(ff_dim_idx); +} + +positive_int &dim_at_idx(TensorDims &dims, ff_dim_t ff_dim_idx) { + return dims.ff_ordered.at(ff_dim_idx); +} + +std::optional try_dim_at_idx(TensorDims const &dims, + relative_ff_dim_t idx) { + if (dims.ff_ordered.idx_is_valid(idx)) { + return dims.ff_ordered.at(idx); + } else { + return std::nullopt; + } +} + +std::optional try_dim_at_idx(TensorDims const &dims, + ff_dim_t idx) { + if (dims.ff_ordered.idx_is_valid(idx)) { + return dims.ff_ordered.at(idx); + } else { + return std::nullopt; + } +} + positive_int get_num_elements(TensorDims const &d) { return product(d.ff_ordered); } bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal) { - if (num_dims(curr) > num_dims(goal)) { + if (get_num_dims(curr) > get_num_dims(goal)) { return false; } @@ -53,6 +92,80 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, return true; } +bool tensor_dims_contains_coord(TensorDims const &tensor_dims, + TensorDimsCoord const &coord) { + ASSERT(coord.ff_ordered.size() == get_num_dims(tensor_dims)); + + return all_are_true(zip_with( + coord.ff_ordered, + tensor_dims.ff_ordered, + [](nonnegative_int const &coord_entry, positive_int const &dim_size) { + return coord_entry < dim_size; + })); +} + +TensorDimsCoord get_broadcast_src_coord(TensorDims const &input_dims, + TensorDims const &output_dims, + TensorDimsCoord const &dst_coord) { + ASSERT(tensor_dims_contains_coord(output_dims, dst_coord), + output_dims, + dst_coord); + ASSERT(tensor_dims_is_broadcastable_to(input_dims, output_dims), + input_dims, + output_dims); + + relative_ff_dim_t trailing_start_idx = + relative_ff_dim_t{-1 * get_num_dims(input_dims).unwrap_nonnegative()}; + + FFOrdered trailing_entries = + slice(dst_coord.ff_ordered, trailing_start_idx); + + FFOrdered trailing_dims = + slice(output_dims.ff_ordered, trailing_start_idx); + + TensorDimsCoord result = TensorDimsCoord{ + zip_with(trailing_entries, + input_dims.ff_ordered, + [](nonnegative_int const &coord_entry, + positive_int const &input_dim_size) { + if (input_dim_size == 1) { + return 0_n; + } else { + return coord_entry; + } + }), + }; + + ASSERT(tensor_dims_contains_coord(input_dims, result), + output_dims, + dst_coord, + input_dims, + result); + + return result; +} + +std::unordered_set + get_tensor_dims_coord_set(TensorDims const &tensor_dims) { + std::vector> per_dim_ranges = transform( + vector_of(tensor_dims.ff_ordered), + [](positive_int dim_size) -> std::vector { + return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) 
{ + return TensorDimsCoord{ff_ordered_of(raw_point)}; + }); +} + +std::unordered_set get_ff_dim_t_set(TensorDims const &tensor_dims) { + return unordered_set_of(get_idxs(tensor_dims.ff_ordered)); +} + std::optional get_broadcast_target_dims(std::unordered_set const &dims) { for (TensorDims target_candidate : dims) { @@ -66,6 +179,19 @@ std::optional return std::nullopt; } +TensorDims tensor_dims_drop_dims( + TensorDims const &dims, + std::function const &should_drop_dim) { + std::vector result; + for (ff_dim_t idx : get_idxs(dims.ff_ordered)) { + if (!should_drop_dim(idx)) { + result.push_back(dims.ff_ordered.at(idx)); + } + } + + return TensorDims{ff_ordered_of(result)}; +} + TensorDims slice_tensor_dims(TensorDims const &dims, relative_ff_dim_t const &start, std::optional const &stop) { @@ -74,4 +200,12 @@ TensorDims slice_tensor_dims(TensorDims const &dims, }; } +TensorDims slice_tensor_dims(TensorDims const &dims, + ff_dim_t const &start, + std::optional const &stop) { + return TensorDims{ + slice(dims.ff_ordered, start, stop), + }; +} + } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/array_coord.cc b/lib/op-attrs/src/op-attrs/tensor_dims_coord.cc similarity index 53% rename from lib/kernels/src/kernels/array_coord.cc rename to lib/op-attrs/src/op-attrs/tensor_dims_coord.cc index 0927cb9951..6cdf5711ed 100644 --- a/lib/kernels/src/kernels/array_coord.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims_coord.cc @@ -1,12 +1,16 @@ -#include "kernels/array_coord.h" +#include "op-attrs/tensor_dims_coord.h" #include "op-attrs/ff_ordered/ff_ordered_of.h" #include "op-attrs/ff_ordered/get_idxs.h" -#include namespace FlexFlow { -ArrayCoord array_coord_drop_dims( - ArrayCoord const &coord, +nonnegative_int + tensor_dims_coord_get_num_dims(TensorDimsCoord const &tensor_dims_coord) { + return nonnegative_int{tensor_dims_coord.ff_ordered.size()}; +} + +TensorDimsCoord tensor_dims_coord_drop_dims( + TensorDimsCoord const &coord, std::function const &should_drop_dim) { std::vector result; for (ff_dim_t idx : get_idxs(coord.ff_ordered)) { @@ -15,7 +19,7 @@ ArrayCoord array_coord_drop_dims( } } - return ArrayCoord{ff_ordered_of(result)}; + return TensorDimsCoord{ff_ordered_of(result)}; } } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 7a1ba810a7..270ebb9e0c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -8,24 +8,18 @@ namespace FlexFlow { -nonnegative_int num_dims(TensorShape const &s) { - return num_elements(s.dims.ff_ordered); +num_bytes_t get_size_in_bytes(TensorShape const &s) { + return num_bytes_t{(get_num_elements(s.dims) * size_of_datatype(s.data_type)) + .nonnegative_int_from_positive_int()}; } -positive_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { - return dim_at_idx(s.dims, idx); -} - -positive_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { - return dim_at_idx(s.dims, idx); -} - -positive_int get_num_elements(TensorShape const &s) { - return get_num_elements(s.dims); -} - -positive_int get_size_in_bytes(TensorShape const &s) { - return get_num_elements(s) * size_of_datatype(s.data_type); +TensorShape tensor_shape_drop_dims( + TensorShape const &input_shape, + std::function const &should_drop_dim) { + return TensorShape{ + /*dims=*/tensor_dims_drop_dims(input_shape.dims, should_drop_dim), + /*data_type=*/input_shape.data_type, + }; } TensorShape slice_tensor_shape(TensorShape const &shape, diff --git 
a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc index 9b0e90b601..140cdaae6f 100644 --- a/lib/op-attrs/test/src/op-attrs/datatype_value.cc +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
@@ -1,15 +1,30 @@ #include "op-attrs/datatype_value.h" +#include "test/utils/doctest/fmt/half.h" #include <doctest/doctest.h> using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("test make_data_type_value") { + SUBCASE("make_half_data_type_value") { + half value = 3.0f; + DataTypeValue data_type_value = make_half_data_type_value(value); + + CHECK(data_type_value.has<half>()); + CHECK_FALSE(data_type_value.has<float>()); + CHECK_FALSE(data_type_value.has<double>()); + CHECK_FALSE(data_type_value.has<int32_t>()); + CHECK_FALSE(data_type_value.has<int64_t>()); + CHECK_FALSE(data_type_value.has<bool>()); + CHECK(data_type_value.get<half>() == value); + } + SUBCASE("make_float_data_type_value") { float value = 1.0f; DataTypeValue data_type_value = make_float_data_type_value(value); CHECK(data_type_value.has<float>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int32_t>()); CHECK_FALSE(data_type_value.has<int64_t>());
@@ -22,6 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_double_data_type_value(value); CHECK(data_type_value.has<double>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<int32_t>()); CHECK_FALSE(data_type_value.has<int64_t>());
@@ -34,6 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_int32_data_type_value(value); CHECK(data_type_value.has<int32_t>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int64_t>());
@@ -46,6 +63,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_int64_data_type_value(value); CHECK(data_type_value.has<int64_t>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int32_t>());
@@ -58,6 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_bool_data_type_value(value); CHECK(data_type_value.has<bool>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int32_t>());
@@ -65,4 +84,60 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(data_type_value.get<bool>() == value); } } +
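// Editorial sketch (not part of the patch): the TEST_CASE below checks the tag
// returned for each factory; the same round-trip also characterizes the new
// make_zero_data_type_value_of_type introduced earlier in this patch, e.g.:
//
//   DataTypeValue zero = make_zero_data_type_value_of_type(DataType::INT32);
//   get_data_type_of_data_type_value(zero)  -> DataType::INT32   (holds for every DataType)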
+ TEST_CASE("get_data_type_of_data_type_value") { + SUBCASE("half") { + DataTypeValue input = make_half_data_type_value(0.0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::HALF; + + CHECK(result == correct); + } + + SUBCASE("float") { + DataTypeValue input = make_float_data_type_value(0.0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::FLOAT; + + CHECK(result == correct); + } + + SUBCASE("double") { + DataTypeValue input = make_double_data_type_value(0.0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::DOUBLE; + + CHECK(result == correct); + } + + SUBCASE("int32") { + DataTypeValue input = make_int32_data_type_value(0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::INT32; + + CHECK(result == correct); + } + + SUBCASE("int64") { + DataTypeValue input = make_int64_data_type_value(0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::INT64; + + CHECK(result == correct); + } + + SUBCASE("bool") { + DataTypeValue input = make_bool_data_type_value(false); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::BOOL; + + CHECK(result == correct); + } + } }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc index d8e04124bc..1743ebb86e 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
@@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { FFOrdered<int> r_input = FFOrdered<int>{2, 1}; FFOrdered<int> result = concat(l_input, r_input); - FFOrdered<int> correct = {1, 3, 1, 2, 1}; + FFOrdered<int> correct = FFOrdered<int>{1, 3, 1, 2, 1}; CHECK(result == correct); }
@@ -29,13 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("concat(std::vector<FFOrdered<T>>)") { SUBCASE("inputs have elements") { std::vector<FFOrdered<int>> input = { - {1}, - {2, 1}, - {1}, + FFOrdered<int>{1}, + FFOrdered<int>{2, 1}, + FFOrdered<int>{1}, }; FFOrdered<int> result = concat(input); - FFOrdered<int> correct = { + FFOrdered<int> correct = FFOrdered<int>{ 1, 2, 1,
@@ -55,10 +55,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("inputs are empty") { - std::vector<FFOrdered<int>> input = {{}, {}, {}}; + std::vector<FFOrdered<int>> input = { + FFOrdered<int>{}, + FFOrdered<int>{}, + FFOrdered<int>{}, + }; FFOrdered<int> result = concat(input); - FFOrdered<int> correct = {}; + FFOrdered<int> correct = FFOrdered<int>{}; CHECK(result == correct); }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc index e1a94e72c3..c8566b6de4 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
@@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(FFOrdered<T>)") { - FFOrdered<std::string> input = {"zero", "one", "two"}; + FFOrdered<std::string> input = FFOrdered<std::string>{"zero", "one", "two"}; std::map<ff_dim_t, std::string> result = enumerate(input); std::map<ff_dim_t, std::string> correct = {
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc index 73036d5662..49bc13cf8e 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
@@ -48,7 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; FFOrdered<int> result = ff_ordered_from_map(m); - FFOrdered<int> correct = {4, 5, 2, 7}; + FFOrdered<int> correct = FFOrdered<int>{4, 5, 2, 7}; CHECK(result == correct); }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc new file mode 100644 index 0000000000..944248d37b --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc
@@ -0,0 +1,26 @@ +#include "op-attrs/ff_ordered/reversed.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("reversed(FFOrdered<T>)") { + SUBCASE("non-empty input") { + FFOrdered<int> input = FFOrdered<int>{1, 2, 3, 2}; + + FFOrdered<int> result = reversed(input); + FFOrdered<int> correct = FFOrdered<int>{2, 3, 2, 1}; + + CHECK(result == correct); + } + + SUBCASE("empty input") { + FFOrdered<int> input = {}; + + FFOrdered<int> result = reversed(input); + FFOrdered<int> correct = {}; + + CHECK(result == correct); + } + } +}
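// Editorial note (not part of the patch): the "empty input" subcase above can
// still write `FFOrdered<int> input = {};` even though the converting
// constructors are now explicit, because empty braces select the
// (non-explicit) default constructor rather than the initializer_list overload.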
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc index 4bf189ec77..2c5c89db29 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc
@@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input is not empty") { - FFOrdered<int> input = {2, 1, 2, 5}; + FFOrdered<int> input = FFOrdered<int>{2, 1, 2, 5}; FFOrdered<std::string> result = transform(input, [](int x) { return fmt::to_string(x); });
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc index 19167cd0ff..4b14bcd134 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc
@@ -6,18 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("zip(FFOrdered<T1>, FFOrdered<T2>)") { - FFOrdered<int> lhs_input = {9, 9, 8, 9}; - FFOrdered<std::string> rhs_input = {"m", "m", "k", "l", "m"}; + FFOrdered<int> lhs_input = FFOrdered<int>{9, 9, 8, 9}; + FFOrdered<std::string> rhs_input = + FFOrdered<std::string>{"m", "m", "k", "l", "m"}; SUBCASE("lhs is longer") { FFOrdered<std::pair<int, std::string>> result = zip(lhs_input, rhs_input); - FFOrdered<std::pair<int, std::string>> correct = { - {9, "m"}, - {9, "m"}, - {8, "k"}, - {9, "l"}, - }; + FFOrdered<std::pair<int, std::string>> correct = + FFOrdered<std::pair<int, std::string>>{ + {9, "m"}, + {9, "m"}, + {8, "k"}, + {9, "l"}, + }; CHECK(result == correct); }
@@ -25,12 +27,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("rhs is longer") { FFOrdered<std::pair<std::string, int>> result = zip(rhs_input, lhs_input); - FFOrdered<std::pair<std::string, int>> correct = { - {"m", 9}, - {"m", 9}, - {"k", 8}, - {"l", 9}, - }; + FFOrdered<std::pair<std::string, int>> correct = + FFOrdered<std::pair<std::string, int>>{ + {"m", 9}, + {"m", 9}, + {"k", 8}, + {"l", 9}, + }; CHECK(result == correct); }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc new file mode 100644 index 0000000000..d61f709629 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc
@@ -0,0 +1,80 @@ +#include "op-attrs/ff_ordered/zip_with.h" +#include "test/utils/doctest/fmt/pair.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("zip_with(FFOrdered<T1>, FFOrdered<T2>, F)") { + SUBCASE("result types and input types are all different") { + FFOrdered<int> v1 = FFOrdered<int>{1, 3, 4, 3}; + FFOrdered<std::string> v2 = + FFOrdered<std::string>{"aa", "cc", "bb", "dd"}; + + FFOrdered<std::pair<int, std::string>> result = + zip_with(v1, v2, [](int x1, std::string const &x2) { + return std::make_pair(x1, x2); + }); + FFOrdered<std::pair<int, std::string>> correct = + FFOrdered<std::pair<int, std::string>>{ + {1, "aa"}, + {3, "cc"}, + {4, "bb"}, + {3, "dd"}, + }; + + CHECK(result == correct); + } + + SUBCASE("input lengths don't match") { + auto add = [](int x1, int x2) { return x1 + x2; }; + + FFOrdered<int> shorter = FFOrdered<int>{1, 2}; + FFOrdered<int> longer = FFOrdered<int>{1, 3, 5, 7}; + + SUBCASE("first input is shorter") { + FFOrdered<int> result = zip_with(shorter, longer, add); + FFOrdered<int> correct = FFOrdered<int>{1 + 1, 2 + 3}; + + CHECK(result == correct); + } + + SUBCASE("second input is shorter") { + FFOrdered<int> result = zip_with(longer, shorter, add); + FFOrdered<int> correct = FFOrdered<int>{1 + 1, 2 + 3}; + + CHECK(result == correct); + } + } + + SUBCASE("properly handles empty inputs") { + FFOrdered<int> nonempty = FFOrdered<int>{1, 2}; + FFOrdered<int> empty = {}; + + auto throw_err = [](int x1, int x2) -> int { + throw std::runtime_error("error"); + }; + + SUBCASE("first input is empty") { + FFOrdered<int> result = zip_with(empty, nonempty, throw_err); + FFOrdered<int> correct = empty; + + CHECK(result == correct); + } + + SUBCASE("second input is empty") { + FFOrdered<int> result = zip_with(nonempty, empty, throw_err); + FFOrdered<int> correct = empty; + + CHECK(result == correct); + } + + SUBCASE("both inputs are empty") { + FFOrdered<int> result = zip_with(empty, empty, throw_err); + FFOrdered<int> correct = empty; + + CHECK(result == correct); + } + } + } +}
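// Editorial sketch (not part of the patch): the "input lengths don't match"
// subcases above pin down truncate-to-shorter semantics. FFOrdered's zip_with
// (added earlier in this patch) forwards to the utils/containers version; a
// minimal stand-in over std::vector with the same asserted behavior could look
// like the following (vec_zip_with is a hypothetical name, not FlexFlow API):

#include <algorithm>
#include <vector>

template <typename T1, typename T2, typename F>
auto vec_zip_with(std::vector<T1> const &lhs, std::vector<T2> const &rhs, F &&f)
    -> std::vector<decltype(f(lhs.front(), rhs.front()))> {
  std::vector<decltype(f(lhs.front(), rhs.front()))> result;
  size_t n = std::min(lhs.size(), rhs.size()); // truncate to the shorter input
  for (size_t i = 0; i < n; i++) {
    result.push_back(f(lhs.at(i), rhs.at(i)));
  }
  return result;
}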
4ef34c666e..72d499d20e 100644
--- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc
+++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc
@@ -1,5 +1,6 @@
 #include "op-attrs/ops/element_binary.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_dims.h"
 #include "test/utils/doctest/fmt/expected.h"
 #include
@@ -41,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     SUBCASE("mismatched dim size") {
       TensorShape incorrect_rhs = input_lhs;
-      dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_p;
+      dim_at_idx(incorrect_rhs.dims, relative_ff_dim_t{0}) += 1_p;
 
       tl::expected result =
           get_output_shape(attrs, input_lhs, incorrect_rhs);
diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc
index 61934fd1fe..4e0dd149ab 100644
--- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc
+++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc
@@ -85,8 +85,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     TensorShape projection = TensorShape{
         TensorDims{
             FFOrdered{
-                in_channels,
                 out_channels,
+                in_channels,
             },
         },
         DataType::FLOAT,
@@ -145,10 +145,10 @@ TEST_SUITE(FF_TEST_SUITE) {
     auto make_projection = [&](SumDegree o_sum,
                                DiscardCopyDegree o_eq,
-                               positive_int o_inchannel,
-                               positive_int o_outchannel) {
+                               positive_int o_outchannel,
+                               positive_int o_inchannel) {
       return lift_to_parallel_with_degrees(
-          projection, o_sum, o_eq, FFOrdered{o_inchannel, o_outchannel});
+          projection, o_sum, o_eq, FFOrdered{o_outchannel, o_inchannel});
     };
 
     auto make_bias = [&](SumDegree o_sum,
@@ -232,8 +232,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       tl::expected correct =
           make_projection(SumDegree{1_p},
                           DiscardCopyDegree{input_sum_degree},
-                          degree,
-                          1_p);
+                          1_p,
+                          degree);
 
       CHECK(result == correct);
     }
@@ -274,8 +274,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       tl::expected correct =
           make_projection(SumDegree{1_p},
                           DiscardCopyDegree{input_sum_degree},
-                          1_p,
-                          degree);
+                          degree,
+                          1_p);
 
       CHECK(result == correct);
     }
diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc
index 7c559cf5a8..fc501873d9 100644
--- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc
+++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc
@@ -1,10 +1,63 @@
 #include "op-attrs/tensor_dims.h"
 #include "test/utils/doctest/fmt/optional.h"
+#include "test/utils/doctest/fmt/unordered_set.h"
 #include
 
 using namespace ::FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("tensor_dims_has_dim") {
+    SUBCASE("nonempty tensor_dims") {
+      TensorDims tensor_dims = TensorDims{FFOrdered{6_p, 9_p, 8_p}};
+
+      SUBCASE("does have dim") {
+        bool correct = true;
+        SUBCASE("leading dim") {
+          ff_dim_t dim = ff_dim_t{0_n};
+
+          bool result = tensor_dims_has_dim(tensor_dims, dim);
+
+          CHECK(result == correct);
+        }
+
+        SUBCASE("internal dim") {
+          ff_dim_t dim = ff_dim_t{1_n};
+
+          bool result = tensor_dims_has_dim(tensor_dims, dim);
+
+          CHECK(result == correct);
+        }
+
+        SUBCASE("trailing dim") {
+          ff_dim_t dim = ff_dim_t{2_n};
+
+          bool result = tensor_dims_has_dim(tensor_dims, dim);
+
+          CHECK(result == correct);
+        }
+      }
+
+      SUBCASE("dim is too large") {
+        ff_dim_t dim = ff_dim_t{3_n};
+
+        bool result = tensor_dims_has_dim(tensor_dims, dim);
+        bool correct = false;
+
+        CHECK(result == correct);
+      }
+    }
+
+    SUBCASE("empty tensor_dims") {
+      TensorDims tensor_dims = TensorDims{FFOrdered{}};
+      ff_dim_t dim = ff_dim_t{0_n};
+
+      bool result = tensor_dims_has_dim(tensor_dims, dim);
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+  }
+
   TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") {
     TensorDims
goal = TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; @@ -62,6 +115,39 @@ TEST_SUITE(FF_TEST_SUITE) { } } + TEST_CASE("get_tensor_dims_coord_set") { + SUBCASE("TensorDims is not empty") { + TensorDims input = TensorDims{ + FFOrdered{3_p, 1_p, 2_p}, + }; + + std::unordered_set result = + get_tensor_dims_coord_set(input); + std::unordered_set correct = { + TensorDimsCoord{FFOrdered{0_n, 0_n, 0_n}}, + TensorDimsCoord{FFOrdered{0_n, 0_n, 1_n}}, + TensorDimsCoord{FFOrdered{1_n, 0_n, 0_n}}, + TensorDimsCoord{FFOrdered{1_n, 0_n, 1_n}}, + TensorDimsCoord{FFOrdered{2_n, 0_n, 0_n}}, + TensorDimsCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("TensorDims is zero-dimensional") { + TensorDims input = TensorDims{FFOrdered{}}; + + std::unordered_set result = + get_tensor_dims_coord_set(input); + std::unordered_set correct = { + TensorDimsCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } + TEST_CASE("get_broadcast_target_dims(std::unordered_set)") { TensorDims d1 = TensorDims{FFOrdered{1_p, 10_p, 4_p, 3_p}}; @@ -119,4 +205,47 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } } + + TEST_CASE("tensor_dims_drop_dims") { + TensorDims dims = TensorDims{ + FFOrdered{3_p, 5_p, 1_p, 2_p}, + }; + + SUBCASE("removes dims specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return d.value % 2_n == 0_n; + }; + + TensorDims result = tensor_dims_drop_dims(dims, should_drop_dim); + TensorDims correct = TensorDims{ + FFOrdered{5_p, 2_p}, + }; + + CHECK(result == correct); + } + + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return false; + }; + + TensorDims result = tensor_dims_drop_dims(dims, should_drop_dim); + TensorDims correct = dims; + + CHECK(result == correct); + } + + SUBCASE( + "returns empty dims if all dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return true; + }; + + TensorDims result = tensor_dims_drop_dims(dims, should_drop_dim); + TensorDims correct = TensorDims{FFOrdered{}}; + + CHECK(result == correct); + } + } } diff --git a/lib/kernels/test/src/kernels/array_coord.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims_coord.cc similarity index 59% rename from lib/kernels/test/src/kernels/array_coord.cc rename to lib/op-attrs/test/src/op-attrs/tensor_dims_coord.cc index bbb503caf1..bb24bfd059 100644 --- a/lib/kernels/test/src/kernels/array_coord.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims_coord.cc @@ -1,11 +1,11 @@ -#include "kernels/array_coord.h" +#include "op-attrs/tensor_dims_coord.h" #include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("array_coord_drop_dims") { - ArrayCoord coord = ArrayCoord{ + TEST_CASE("tensor_dims_coord_drop_dims") { + TensorDimsCoord coord = TensorDimsCoord{ FFOrdered{3_n, 5_n, 0_n, 1_n}, }; @@ -14,8 +14,9 @@ TEST_SUITE(FF_TEST_SUITE) { return d.value % 2_n == 0_n; }; - ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); - ArrayCoord correct = ArrayCoord{ + TensorDimsCoord result = + tensor_dims_coord_drop_dims(coord, should_drop_dim); + TensorDimsCoord correct = TensorDimsCoord{ FFOrdered{5_n, 1_n}, }; @@ -28,8 +29,9 @@ TEST_SUITE(FF_TEST_SUITE) { return false; }; - ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); - ArrayCoord correct = coord; + TensorDimsCoord result = + tensor_dims_coord_drop_dims(coord, should_drop_dim); + TensorDimsCoord correct = coord; CHECK(result == correct); } @@ -40,8 
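// Illustrative sketch, not from the patch: tensor_dims_drop_dims, exercised by
// the tests above, removes exactly the dimensions whose ff_dim_t index
// satisfies the predicate. A minimal usage sketch, assuming the op-attrs
// tensor_dims header and the _n literal used throughout these tests:
#include "op-attrs/tensor_dims.h"
#include <functional>

TensorDims drop_even_dims(TensorDims const &dims) {
  std::function<bool(ff_dim_t)> should_drop_dim = [](ff_dim_t d) {
    return d.value % 2_n == 0_n; // drop dims 0, 2, 4, ...
  };
  // For dims {3, 5, 1, 2} this keeps dims 1 and 3, yielding {5, 2}.
  return tensor_dims_drop_dims(dims, should_drop_dim);
}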
+42,9 @@ TEST_SUITE(FF_TEST_SUITE) { return true; }; - ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); - ArrayCoord correct = ArrayCoord{FFOrdered{}}; + TensorDimsCoord result = + tensor_dims_coord_drop_dims(coord, should_drop_dim); + TensorDimsCoord correct = TensorDimsCoord{FFOrdered{}}; CHECK(result == correct); } diff --git a/lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml b/lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml new file mode 100644 index 0000000000..f4714a87c8 --- /dev/null +++ b/lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "CGOperatorPlusSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", +] + +includes = [ + "op-attrs/computation_graph_op_attrs.dtg.h", + "pcg/cg_operator_tensor_shape_signature.dtg.h", + "", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::ComputationGraphOpAttrs" + +[[fields]] +name = "tensor_shape_signature" +type = "::FlexFlow::CGOperatorTensorShapeSignature" diff --git a/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h new file mode 100644 index 0000000000..3629aaff43 --- /dev/null +++ b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_CG_OPERATOR_TENSOR_SHAPE_SIGNATURE_H +#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_CG_OPERATOR_TENSOR_SHAPE_SIGNATURE_H + +#include "pcg/cg_operator_tensor_shape_signature.dtg.h" +#include "pcg/tensor_role.dtg.h" + +namespace FlexFlow { + +std::vector + tensor_shapes_for_role(CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role); + +TensorShape tensor_shape_for_role_and_index( + CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role, + nonnegative_int index); + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml new file mode 100644 index 0000000000..a2a6c047c6 --- /dev/null +++ b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "CGOperatorTensorShapeSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", + "rapidcheck", +] + +includes = [ + "op-attrs/tensor_shape.dtg.h", + "", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::TensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::TensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::TensorShape>" diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 2be2a54cd8..d90898716f 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -145,7 +145,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); tensor_guid_t layer_norm(tensor_guid_t const &input, - std::vector const &axes, + std::set const &axes, bool elementwise_affine, float eps, std::optional const &name = std::nullopt); diff --git a/lib/pcg/include/pcg/file_format/v1/data_type_value.h b/lib/pcg/include/pcg/file_format/v1/data_type_value.h index ec3910aab3..dae0ccb368 100644 --- a/lib/pcg/include/pcg/file_format/v1/data_type_value.h +++ b/lib/pcg/include/pcg/file_format/v1/data_type_value.h @@ -1,7 +1,7 @@ #ifndef 
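// Illustrative sketch, not from the patch: CGOperatorTensorShapeSignature
// (added above) bundles an operator's input, weight, and output TensorShapes,
// and the two helpers declared in cg_operator_tensor_shape_signature.h select
// shapes by role. A hypothetical lookup of an operator's second weight shape,
// using only the declared API and the _n nonnegative_int literal:
#include "pcg/cg_operator_tensor_shape_signature.h"

TensorShape second_weight_shape(CGOperatorTensorShapeSignature const &sig) {
  // Equivalent to tensor_shapes_for_role(sig, TensorRole::WEIGHT).at(1).
  return tensor_shape_for_role_and_index(sig, TensorRole::WEIGHT, 1_n);
}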
_FLEXFLOW_PCG_INCLUDE_PCG_FILE_FORMAT_V1_DATA_TYPE_H #define _FLEXFLOW_PCG_INCLUDE_PCG_FILE_FORMAT_V1_DATA_TYPE_H -#include "utils/fp16.h" +#include "utils/half.h" #include namespace FlexFlow { diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index 51dd92c23a..5d9ea8d112 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -2,11 +2,12 @@ #define _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H #include "pcg/optimizer_attrs.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { OptimizerAttrs get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old); -int get_num_optimizer_tensors(OptimizerAttrs const &); +nonnegative_int get_num_optimizer_tensors(OptimizerAttrs const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml b/lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml new file mode 100644 index 0000000000..e827dae891 --- /dev/null +++ b/lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "PCGOperatorPlusSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", +] + +includes = [ + "op-attrs/pcg_operator_attrs.dtg.h", + "pcg/pcg_operator_tensor_shape_signature.dtg.h", + "", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::PCGOperatorAttrs" + +[[fields]] +name = "tensor_shape_signature" +type = "::FlexFlow::PCGOperatorTensorShapeSignature" diff --git a/lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml b/lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml new file mode 100644 index 0000000000..3e99bdde64 --- /dev/null +++ b/lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "PCGOperatorTensorShapeSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", +] + +includes = [ + "op-attrs/parallel_tensor_shape.dtg.h", + "", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" diff --git a/lib/task-spec/include/task-spec/tensor_role.enum.toml b/lib/pcg/include/pcg/tensor_role.enum.toml similarity index 100% rename from lib/task-spec/include/task-spec/tensor_role.enum.toml rename to lib/pcg/include/pcg/tensor_role.enum.toml diff --git a/lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc b/lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc new file mode 100644 index 0000000000..90ffb85c9b --- /dev/null +++ b/lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc @@ -0,0 +1,28 @@ +#include "pcg/cg_operator_tensor_shape_signature.h" + +namespace FlexFlow { + +std::vector + tensor_shapes_for_role(CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role) { + switch (tensor_role) { + case TensorRole::INPUT: + return signature.input_shapes; + case TensorRole::WEIGHT: + return signature.weight_shapes; + case TensorRole::OUTPUT: + return signature.output_shapes; + default: + PANIC("Unhandled tensor role", tensor_role); + }; +} + +TensorShape tensor_shape_for_role_and_index( + CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role, + nonnegative_int index) { + return tensor_shapes_for_role(signature, tensor_role) + 
.at(index.unwrap_nonnegative()); +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 0a24acc6aa..4feefa713e 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -41,6 +41,7 @@ #include "utils/containers/without_nullopts.h" #include "utils/containers/zip_with_strict.h" #include "utils/expected.h" +#include "utils/fmt/set.h" #include "utils/stack_vector/stack_vector_of.h" #include @@ -480,8 +481,8 @@ tensor_guid_t ComputationGraphBuilder::gather( DataType::INT64)); } - GatherAttrs attrs = GatherAttrs{ - ff_dim_t_from_relative_ff_dim_t(dim, num_dims(this->get_shape(input)))}; + GatherAttrs attrs = GatherAttrs{ff_dim_t_from_relative_ff_dim_t( + dim, get_num_dims(this->get_shape(input).dims))}; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -702,7 +703,7 @@ tensor_guid_t ComputationGraphBuilder::concat( std::optional const &maybe_name) { ff_dim_t abs_axis = ff_dim_t_from_relative_ff_dim_t( - axis, num_dims(this->get_shape(inputs.at(0)))); + axis, get_num_dims(this->get_shape(inputs.at(0)).dims)); ConcatAttrs attrs = ConcatAttrs{abs_axis}; @@ -719,7 +720,7 @@ tensor_guid_t ComputationGraphBuilder::flat( relative_ff_dim_t start_dim, std::optional const &end_dim, std::optional const &maybe_name) { - nonnegative_int input_num_dims = num_dims(this->get_shape(input)); + nonnegative_int input_num_dims = get_num_dims(this->get_shape(input).dims); ff_dim_t abs_start_dim = ff_dim_t_from_relative_ff_dim_t(start_dim, input_num_dims); @@ -743,7 +744,7 @@ tensor_guid_t ComputationGraphBuilder::flat( tensor_guid_t ComputationGraphBuilder::layer_norm( tensor_guid_t const &input, - std::vector const &relative_axes, + std::set const &relative_axes, bool elementwise_affine, float eps, std::optional const &maybe_name) { @@ -751,26 +752,26 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( TensorShape input_shape = this->get_shape(input); auto resolve_dim_idx = [&](relative_ff_dim_t dim_idx) { - return ff_dim_t_from_relative_ff_dim_t(dim_idx, num_dims(input_shape)); + return ff_dim_t_from_relative_ff_dim_t(dim_idx, + get_num_dims(input_shape.dims)); }; - stack_vector axes = stack_vector_of( - transform(relative_axes, resolve_dim_idx)); + std::set axes = transform(relative_axes, resolve_dim_idx); if (any_of(axes, [&](ff_dim_t axis) { - return axis.value >= num_dims(input_shape); + return axis.value >= get_num_dims(input_shape.dims); })) { throw mk_runtime_error(fmt::format( "ComputationGraphBuilder::layer_norm received axes {} with " "out-of-bound element (input tensor has num dimensions = {})", axes, - num_dims(input_shape))); + get_num_dims(input_shape.dims))); } LayerNormAttrs attrs = LayerNormAttrs{ - axes, - elementwise_affine, - eps, + /*axes=*/axes, + /*elementwise_affine=*/elementwise_affine, + /*eps=*/eps, }; std::string name = @@ -790,19 +791,16 @@ tensor_guid_t ComputationGraphBuilder::softmax( TensorShape input_shape = this->get_shape(input); - relative_ff_dim_t dim = maybe_dim.value_or( - relative_ff_dim_t{num_dims(input_shape).unwrap_nonnegative() - 1}); + relative_ff_dim_t dim = maybe_dim.value_or(relative_ff_dim_t{ + get_num_dims(input_shape.dims).unwrap_nonnegative() - 1}); - SoftmaxAttrs attrs = - SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(dim, num_dims(input_shape))}; + SoftmaxAttrs attrs = SoftmaxAttrs{ + ff_dim_t_from_relative_ff_dim_t(dim, get_num_dims(input_shape.dims))}; - if (attrs.dim.value >= 
num_dims(input_shape)) { - throw mk_runtime_error( - fmt::format("ComputationGraphBuilder::softmax received out-of-bounds " - "dim {} for input tensor shape {}", - attrs.dim.value, - input_shape)); - } + ASSERT(attrs.dim.value < get_num_dims(input_shape.dims), + "ComputationGraphBuilder::softmax received out_of_bounds dim", + attrs.dim, + input_shape); std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index 7a37091428..b99fcd600b 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -23,16 +23,16 @@ OptimizerAttrs } } -int get_num_optimizer_tensors(OptimizerAttrs const &attrs) { - return attrs.visit( +nonnegative_int get_num_optimizer_tensors(OptimizerAttrs const &attrs) { + return attrs.visit( overload{[&](SGDOptimizerAttrs const &o) { if (o.momentum > 0.0f) { - return 1; + return 1_n; } else { - return 0; + return 0_n; } }, - [&](AdamOptimizerAttrs const &) { return 2; }}); + [&](AdamOptimizerAttrs const &) { return 2_n; }}); } } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index b08c0a575d..052d30df0f 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -64,13 +64,8 @@ ParallelLayerAddedResult add_parallel_layer( std::vector correct_weight_shapes = get_weight_shapes(layer_attrs.op_attrs, input_shapes); - if (weight_shapes != correct_weight_shapes) { - throw mk_runtime_error( - fmt::format("add_parallel_layer expected weight shapes {}, but " - "received weights with shapes {}", - correct_weight_shapes, - weight_shapes)); - } + ASSERT(weight_shapes == correct_weight_shapes, + "add_parallel_layer received incorrect weight shapes"); std::vector output_shapes = get_output_shapes(layer_attrs.op_attrs, input_shapes); diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt index 623816567e..a325e14955 100644 --- a/lib/realm-backend/CMakeLists.txt +++ b/lib/realm-backend/CMakeLists.txt @@ -11,6 +11,7 @@ ff_add_library( op-attrs utils kernels + compiler local-execution pcg spdlog diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index b1580b0305..e95b4c81ea 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -4,29 +4,24 @@ #include "realm-backend/realm_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - struct ModelTrainingInstance { - ModelTrainingInstance(RealmTrainingBacking const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, + ModelTrainingInstance(RealmRuntimeState &, + LocalTrainingBacking const &, LossAttrs const &, OptimizerAttrs const &); - RealmTrainingBacking training_backing; - tensor_guid_t logit_tensor; - loss_tensor_t label_tensor; + RealmRuntimeState &runtime_state; + LocalTrainingBacking training_backing; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; public: - PerLayerElapsedTime forward(); - 
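// Illustrative sketch, not from the patch: the get_num_optimizer_tensors
// change above encodes how many auxiliary tensors each optimizer keeps per
// weight: 0_n for plain SGD, 1_n (velocity) for SGD with momentum, 2_n (first
// and second moments) for Adam. A hypothetical caller sizing per-weight
// buffers with it (the helper name is assumed):
#include "kernels/accessor.h"
#include "pcg/optimizer_attrs.h"
#include <vector>

std::vector<GenericTensorAccessorW> make_aux_buffer_slots(OptimizerAttrs const &attrs) {
  nonnegative_int n = get_num_optimizer_tensors(attrs);
  std::vector<GenericTensorAccessorW> aux;
  aux.reserve(n.unwrap_nonnegative()); // 0 for SGD, 1 with momentum, 2 for Adam
  return aux;
}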
PerLayerElapsedTime backward(); + std::unordered_map> forward(); + std::unordered_map> backward(); void update(); GenericTensorAccessorR get_loss_tensor_accessor() const; }; diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h deleted file mode 100644 index 75f954c0ad..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_args_backing.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H -#define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H - -#include "pcg/computation_graph.h" -#include "pcg/layer_guid_t.dtg.h" -#include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/task_result.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/per_device_op_state.h" -#include "task-spec/runtime_arg_config.h" -#include "task-spec/task_invocation.dtg.h" - -namespace FlexFlow { - -struct RealmArgsBacking { - RealmArgsBacking(RuntimeArgConfig const &, - std::unordered_map const &); - -public: - // arguments - RuntimeArgConfig runtime_arg_config; - std::unordered_map - per_device_op_states; -}; - -RealmArgsBacking -make_args_backing_with_empty_device_states(RuntimeArgConfig const &); - -std::optional -get_per_device_op_state_if_exists(RealmArgsBacking const &, - layer_guid_t const &); - -ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &, - RuntimeArgConfig const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h deleted file mode 100644 index 0e83a3de6f..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H - -#include "realm-backend/realm_allocator.h" -#include "task-spec/slot_tensor_type_id.dtg.h" -#include "task-spec/task_argument_accessor.h" -#include -#include - -namespace FlexFlow { - -using TensorSlotsBacking = std::unordered_map< - SlotTensorTypeId, - std::variant>>; -using ArgSlotsBacking = std::unordered_map; - -struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { - RealmTaskArgumentAccessor(Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing); - - RealmTaskArgumentAccessor(RealmTaskArgumentAccessor const &) = delete; - RealmTaskArgumentAccessor(RealmTaskArgumentAccessor &&) = delete; - - ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; - - GenericTensorAccessor get_tensor(slot_id_t slot, Permissions priv, - TensorType tensor_type) const override; - VariadicGenericTensorAccessor - get_variadic_tensor(slot_id_t slot, Permissions priv, - TensorType tensor_type) const override; - - Allocator get_allocator() const override; - - size_t get_device_idx() const override; - -private: - Allocator allocator; - TensorSlotsBacking tensor_slots_backing; - ArgSlotsBacking arg_slots_backing; -}; - -CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); - -} // namespace FlexFlow - -#endif diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h deleted file mode 100644 index b38815ffee..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ /dev/null @@ -1,47 +0,0 @@ - -#ifndef 
_FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H -#define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H - -#include "kernels/accessor.h" -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.dtg.h" -#include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "realm-backend/realm_allocator.h" -#include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/realm_tensor_backing.dtg.h" -#include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/task_invocation.dtg.h" -#include "task-spec/tensor_role.dtg.h" -namespace FlexFlow { - - GenericTensorAccessorW get_tensor(RealmTensorBacking const &, - TensorTypeVariant const &); - - std::unordered_map - get_tensor_backings( - std::unordered_map const &, - std::unordered_map const &, - Allocator &); - - std::unordered_map> - merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated); - - RealmTensorBacking construct_realm_tensor_backing(AllocatedTensors const &, - UnallocatedTensors const &, - Allocator &); - - TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, - TaskBinding const &); - - } // namespace FlexFlow - - #endif \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml deleted file mode 100644 index d53071dd0e..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "RealmTensorBacking" -features = [ - "eq", - "fmt", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "kernels/accessor.h", - "pcg/tensor_guid_t.dtg.h", - "task-spec/gradient_tensor_t.dtg.h", - "task-spec/optimizer_tensor_t.dtg.h", -] - -src_includes = [ - "utils/fmt/unordered_map.h", - "utils/fmt/vector.h", -] - -[[fields]] -name = "tensor_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" - -[[fields]] -name = "tensor_gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "tensor_optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 8fe842daf6..57fc7147ce 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -1,79 +1,63 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H #define _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/task_registry.h" +#include "local-execution/local_training_backing.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" -#include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/unallocated_tensors.h" +#include "task-spec/training_computation_graph.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" +#include 
"utils/containers/generate_map.h" +#include "utils/units/milliseconds_t.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" -#include "realm-backend/realm_args_backing.h" -#include "realm-backend/realm_tensor_backing.h" #include "realm-backend/task_wrapper.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - -struct RealmTrainingBacking { - RealmTrainingBacking(Realm::Processor, - std::vector const &, - std::vector const &, - AllocatedTensors const &, - GradientTensorSource &, - ComputationGraph const &, RuntimeArgConfig const &); - - RealmTrainingBacking(Realm::Processor, - std::vector const &, - std::vector const &, - AllocatedTensors const &, - GradientTensorSource &, - OptimizerTensorSource &, - ComputationGraph const &, RuntimeArgConfig const &, - OptimizerAttrs const &); - -public: - // runtime +struct RealmRuntimeState { Realm::Processor master_proc; Realm::Event master_event; Realm::Memory master_mem; std::vector worker_procs; std::vector worker_events; std::vector allocators; +}; - ComputationGraph computation_graph; - TaskRegistry task_registry; +LocalTrainingBacking make_realm_training_backing_for_computation_graph( + RealmRuntimeState &runtime_state, + std::unordered_map const + &preallocated_tensors, + TrainingComputationGraph const &training_computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs); - RealmTensorBacking realm_tensor_backing; - RealmArgsBacking realm_args_backing; -}; +void register_tasks_for_realm(LocalTaskRegistry const &, RealmRuntimeState &); + +std::optional + create_per_device_op_state(LocalTaskRegistry const &, + LocalTensorBacking const &, + RuntimeArgConfig const &, + RealmRuntimeState &, + TrainingLayerPlusContext const &); -TaskRegistry construct_task_registry_and_register_tasks_for_realm( - ComputationGraph const &, std::vector const &); +Future> execute_forward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + RealmRuntimeState &); -RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, - ComputationGraph const &, - RuntimeArgConfig const &); +Future> execute_backward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + RealmRuntimeState &); -void execute_init(RealmTrainingBacking &, layer_guid_t const &); -Future execute_forward(RealmTrainingBacking &, - layer_guid_t const &); -Future execute_backward(RealmTrainingBacking &, - layer_guid_t const &); -Future compute_loss(RealmTrainingBacking &, LossAttrs const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); -Future execute_update(RealmTrainingBacking &, layer_guid_t const &, - OptimizerAttrs const &); +Future compute_loss(LocalTrainingBacking const &, LossAttrs const &, RealmRuntimeState &); -TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &, - RealmArgsBacking const &, - TaskInvocation const &, - Allocator &); +Future execute_update(LocalTrainingBacking const &, + layer_guid_t const &, + OptimizerAttrs const &, + RealmRuntimeState &); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index d869982563..46e5f89274 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H 
#include "realm-backend/driver.h" -#include "realm-backend/realm_task_argument_accessor.h" #include #include diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h index 64a360e549..fa6c9f0ed3 100644 --- a/lib/realm-backend/include/realm-backend/task_wrapper.h +++ b/lib/realm-backend/include/realm-backend/task_wrapper.h @@ -1,8 +1,7 @@ #ifndef _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H #define _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H -#include "local-execution/task_registry.h" -#include "realm-backend/realm_task_argument_accessor.h" +#include "local-execution/local_task_registry.h" #include "realm-backend/task_result.h" namespace FlexFlow { diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index 87b8121bd5..a7d359b638 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -1,98 +1,114 @@ #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "realm-backend/model_training_instance.h" -#include "kernels/format_accessor_contents.h" +#include "task-spec/training_computation_graph.h" #include "utils/containers/reversed.h" namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( - RealmTrainingBacking const &realm_training_backing, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, + RealmRuntimeState &runtime_state, + LocalTrainingBacking const &local_training_backing, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) - : training_backing(realm_training_backing), loss_attrs(loss_attrs), - optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor){}; + : runtime_state(runtime_state), training_backing(local_training_backing), + loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs) {} -PerLayerElapsedTime ModelTrainingInstance::forward() { - PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map> +std::unordered_map> + ModelTrainingInstance::forward() { + + std::unordered_map> + per_layer_elapsed_time; + std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { + + for (layer_guid_t const &layer_guid : + topological_ordering(this->training_backing.training_computation_graph + .computation_graph)) { per_layer_elapsed_time_future.insert( - {node, execute_forward(this->training_backing, node)}); + {layer_guid, + execute_forward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->runtime_state) + }); } - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { - float elapsed_time = - per_layer_elapsed_time_future[node].get(); - per_layer_elapsed_time.insert({node, elapsed_time}); + + for (layer_guid_t const &layer_guid : topological_ordering( + this->training_backing.training_computation_graph + .computation_graph)) { + std::optional elapsed_time = + per_layer_elapsed_time_future[layer_guid].get(); + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); } return per_layer_elapsed_time; } -PerLayerElapsedTime ModelTrainingInstance::backward() { - compute_loss(this->training_backing, - this->loss_attrs, - this->logit_tensor, - this->label_tensor); - - 
gradient_tensor_t loss_tensor = - this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); - GenericTensorAccessorW loss_tensor_backing = - this->training_backing.realm_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); +std::unordered_map> + ModelTrainingInstance::backward() { + compute_loss(this->training_backing, this->loss_attrs, this->runtime_state); - PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map> + std::unordered_map> + per_layer_elapsed_time; + std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : reversed(topological_ordering( - this->training_backing.computation_graph))) { + + for (layer_guid_t const &layer_guid : reversed(topological_ordering( + this->training_backing.training_computation_graph + .computation_graph))) { per_layer_elapsed_time_future.insert( - {node, execute_backward(this->training_backing, node)}); + {layer_guid, + execute_backward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->runtime_state) + }); } - for (layer_guid_t const &node : reversed(topological_ordering( - this->training_backing.computation_graph))) { - float elapsed_time = - per_layer_elapsed_time_future[node].get(); - per_layer_elapsed_time.insert({node, elapsed_time}); + + for (layer_guid_t const &layer_guid : reversed(topological_ordering( + this->training_backing.training_computation_graph + .computation_graph))) { + std::optional elapsed_time = + per_layer_elapsed_time_future[layer_guid].get(); + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); } return per_layer_elapsed_time; } void ModelTrainingInstance::update() { std::unordered_map> per_layer_update_future; - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { + for (layer_guid_t const &layer_guid : topological_ordering( + this->training_backing.training_computation_graph + .computation_graph)) { per_layer_update_future.insert( - {node, execute_update(this->training_backing, - node, - this->optimizer_attrs)}); + {layer_guid, execute_update(this->training_backing, + layer_guid, + this->optimizer_attrs, + this->runtime_state)}); } - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { - per_layer_update_future[node].wait(); + for (layer_guid_t const &layer_guid : topological_ordering( + this->training_backing.training_computation_graph + .computation_graph)) { + per_layer_update_future[layer_guid].wait(); } this->optimizer_attrs = get_optimizer_attrs_for_next_iter( this->optimizer_attrs); } GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { - GenericTensorAccessorW logit_tensor_backing = this->training_backing - .realm_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); - - - gradient_tensor_t loss_tensor = - this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); + gradient_tensor_guid_t loss_tensor = get_gradient_tensor_guid_for_tensor_guid( + this->training_backing.training_computation_graph, + this->training_backing.training_computation_graph.logit_tensor); GenericTensorAccessorW loss_tensor_backing = - this->training_backing.realm_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); - + 
this->training_backing.local_tensor_backing + .backing_for_training_tensor_map.at( + training_tensor_guid_t{loss_tensor}); return read_only_accessor_from_write_accessor(loss_tensor_backing); } diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc deleted file mode 100644 index d30793a801..0000000000 --- a/lib/realm-backend/src/realm_args_backing.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include "op-attrs/parallel_tensor_shape.h" -#include "realm-backend/realm_args_backing.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/map_values.h" -#include "utils/overload.h" - -namespace FlexFlow { - -RealmArgsBacking make_args_backing_with_empty_device_states( - RuntimeArgConfig const &runtime_arg_config) { -return RealmArgsBacking{runtime_arg_config, {}}; -} - -RealmArgsBacking::RealmArgsBacking( - RuntimeArgConfig const &runtime_arg_config, - std::unordered_map const - &device_states) - : runtime_arg_config(runtime_arg_config), - per_device_op_states(device_states){}; - -std::optional get_per_device_op_state_if_exists( - RealmArgsBacking const &realm_args_backing, - layer_guid_t const &layer_guid) { - if (contains_key(realm_args_backing.per_device_op_states, layer_guid)) { - return realm_args_backing.per_device_op_states.at(layer_guid); - } else { - return std::nullopt; - } -} - -ArgSlotsBacking - construct_arg_slots_backing(TaskBinding const &binding, - RuntimeArgConfig const &runtime_arg_config) { - return map_values( - binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return lower_to_concrete_arg_spec(s, runtime_arg_config); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; -} - -} // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc deleted file mode 100644 index b7f10772e0..0000000000 --- a/lib/realm-backend/src/realm_task_argument_accessor.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "realm-backend/realm_task_argument_accessor.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/transform.h" -#include "utils/hash/pair.h" -#include "utils/overload.h" - -namespace FlexFlow { - -RealmTaskArgumentAccessor::RealmTaskArgumentAccessor( - Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing) - : allocator(allocator), tensor_slots_backing(tensor_slots_backing), - arg_slots_backing(arg_slots_backing){}; - -ConcreteArgSpec const & - RealmTaskArgumentAccessor::get_concrete_arg(slot_id_t name) const { - return this->arg_slots_backing.at(name); -} - -GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor( - slot_id_t slot, Permissions priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto tensor_backing = std::get( - this->tensor_slots_backing.at(slot_tensor_type)); - if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = - read_only_accessor_from_write_accessor(tensor_backing); - return readonly_tensor_backing; - } else if (priv == Permissions::RW || priv == Permissions::WO) { - return tensor_backing; - } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); - } -} - -VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor( - slot_id_t slot, Permissions 
priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto variadic_tensor_backing = std::get>( - this->tensor_slots_backing.at(slot_tensor_type)); - if (priv == Permissions::RO) { - std::vector readonly_variadic_tensor_backing = {}; - for (GenericTensorAccessorW const &tensor_backing : - variadic_tensor_backing) { - readonly_variadic_tensor_backing.push_back( - read_only_accessor_from_write_accessor(tensor_backing)); - } - return readonly_variadic_tensor_backing; - } else if (priv == Permissions::RW || priv == Permissions::WO) { - return variadic_tensor_backing; - } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); - } -} - -Allocator RealmTaskArgumentAccessor::get_allocator() const { - return this->allocator; -} - -size_t RealmTaskArgumentAccessor::get_device_idx() const { - return 0; -} - -} // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc deleted file mode 100644 index 5dcfa8cef8..0000000000 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "realm-backend/realm_tensor_backing.h" -#include "task-spec/slot_grad_id.dtg.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/keys.h" -#include "utils/overload.h" - -namespace FlexFlow { - -GenericTensorAccessorW -get_tensor(RealmTensorBacking const &realm_tensor_backing, - TensorTypeVariant const &tensor_type) { - return realm_tensor_backing.tensor_backings.at(tensor_type); -} - -std::unordered_map> -merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated) { - std::unordered_map> - merged_maps = allocated; - for (std::pair> const - &unallocated_optimizer_tensors : unallocated) { - if (merged_maps.count(unallocated_optimizer_tensors.first)) { - for (optimizer_tensor_t const &optimizer_tensor : - unallocated_optimizer_tensors.second) { - merged_maps[unallocated_optimizer_tensors.first].push_back( - optimizer_tensor); - } - } else { - merged_maps.insert({unallocated_optimizer_tensors}); - } - } - return merged_maps; -} - -std::unordered_map -get_tensor_backings( - std::unordered_map const - &tensor_type_backings, - std::unordered_map const - &tensor_type_shapes, - Allocator &allocator) { - std::unordered_map - all_tensor_backings = tensor_type_backings; - - // allocate new tensors - for (std::pair const &tensor_type_shape : - tensor_type_shapes) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_type_shape.second); - all_tensor_backings.insert({tensor_type_shape.first, tensor_backing}); - } - - return all_tensor_backings; -} - -RealmTensorBacking -construct_realm_tensor_backing(AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator &allocator) { - - std::unordered_map merged_gradient_maps = - allocated_tensors.gradient_mapping; - merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(), - unallocated_tensors.gradient_mapping.end()); - - return RealmTensorBacking{ - get_tensor_backings(allocated_tensors.tensor_type_backings, - unallocated_tensors.tensor_type_shapes, allocator), - merged_gradient_maps, - merge_optimizer_mappings(allocated_tensors.optimizer_mapping, - unallocated_tensors.optimizer_mapping)}; -} - -TensorSlotsBacking 
-construct_tensor_slots_backing(RealmTensorBacking const &realm_tensor_backing,
-                               TaskBinding const &binding) {
-  TensorSlotsBacking mapping;
-
-  for (std::pair const &tensor_binding :
-       binding.get_tensor_bindings()) {
-    mapping.insert({tensor_binding.first,
-                    get_tensor(realm_tensor_backing, tensor_binding.second)});
-  }
-
-  return mapping;
-}
-
-} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index 053bf62838..b436443cdb 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -1,20 +1,19 @@
-#include "kernels/allocation.h"
-#include "local-execution/loss_functions.h"
-#include "local-execution/optimizer.h"
-#include "pcg/computation_graph.dtg.h"
+#include "local-execution/local_args_backing.h"
 #include "pcg/computation_graph.h"
 #include "pcg/optimizer_attrs.h"
-#include "realm-backend/realm_tensor_backing.h"
+#include "task-spec/loss_functions.h"
 #include "task-spec/op_task_to_task_invocation.h"
-#include "task-spec/runtime_arg_config.h"
+#include "task-spec/optimizer.h"
 #include "task-spec/task_invocation.h"
 #include "task-spec/task_signature_impl.h"
+#include "task-spec/training_computation_graph.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
+#include "utils/containers/is_subseteq_of.h"
+#include "utils/containers/keys.h"
 #include "utils/containers/values.h"
 #include "utils/exception.h"
-
 #include "realm-backend/realm_training_backing.h"
 #include "realm-backend/task_result.h"
 #include "realm-backend/task_wrapper.h"
@@ -23,327 +22,292 @@
 namespace FlexFlow {
 
 using namespace Realm;
 
-RealmTrainingBacking::RealmTrainingBacking(
-    Processor master_proc, std::vector const &worker_procs,
-    std::vector const &allocators,
-    AllocatedTensors const &allocated_tensors,
-    GradientTensorSource &gradient_tensor_source,
-    ComputationGraph const &computation_graph,
-    RuntimeArgConfig const &runtime_arg_config)
-    : master_proc(master_proc), master_event(Realm::Event::NO_EVENT),
-      master_mem(Machine::MemoryQuery(Machine::get_machine())
-                     .only_kind(Memory::SYSTEM_MEM)
-                     .best_affinity_to(master_proc)
-                     .first()),
-      worker_procs(worker_procs),
-      worker_events(std::vector(worker_procs.size(),
-                                Realm::Event::NO_EVENT)),
-      allocators(allocators), computation_graph(computation_graph),
-      task_registry(construct_task_registry_and_register_tasks_for_realm(
-          computation_graph, worker_procs)),
-      realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu
-          allocated_tensors,
-          generate_unallocated_tensors(
-              allocated_tensors, get_all_tensor_attrs(computation_graph),
-              gradient_tensor_source),
-          this->allocators[0])),
-      realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {}
-
-RealmTrainingBacking::RealmTrainingBacking(
-    Processor master_proc, std::vector const &worker_procs,
-    std::vector const &allocators,
-    AllocatedTensors const &allocated_tensors,
-    GradientTensorSource &gradient_tensor_source,
-    OptimizerTensorSource &optimizer_tensor_source,
-    ComputationGraph const &computation_graph,
+LocalTrainingBacking make_realm_training_backing_for_computation_graph(
+    RealmRuntimeState &runtime_state,
+    std::unordered_map<training_tensor_guid_t, GenericTensorAccessorW> const
+        &preallocated,
+    TrainingComputationGraph const &training_computation_graph,
     RuntimeArgConfig const &runtime_arg_config,
-    OptimizerAttrs const &optimizer_attrs)
-    : master_proc(master_proc), master_event(Realm::Event::NO_EVENT),
-      master_mem(Machine::MemoryQuery(Machine::get_machine())
-                     .only_kind(Memory::SYSTEM_MEM)
-                     .best_affinity_to(master_proc)
-                     .first()),
-      worker_procs(worker_procs),
-      worker_events(std::vector(worker_procs.size(),
-                                Realm::Event::NO_EVENT)),
-      allocators(allocators), computation_graph(computation_graph),
-      task_registry(construct_task_registry_and_register_tasks_for_realm(
-          computation_graph, worker_procs)),
-      realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu
-          allocated_tensors,
-          generate_unallocated_tensors_with_optimizer(
-              allocated_tensors, get_all_tensor_attrs(computation_graph),
-              gradient_tensor_source, optimizer_tensor_source,
-              optimizer_attrs),
-          this->allocators[0])),
-      realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {}
-
-TaskRegistry construct_task_registry_and_register_tasks_for_realm(
-    ComputationGraph const &cg, std::vector const &worker_procs) {
-  TaskRegistry task_registry = construct_task_registry(
-      get_layer_attrs_mapping(cg));
-
-  // register tasks for realm
-  std::unordered_map const &layer_attrs_mapping =
-      get_layer_attrs_mapping(cg);
-  for (std::pair const &layer_attrs :
-       layer_attrs_mapping) {
-    ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs;
-    std::vector task_ids = get_task_ids(attrs);
-    for (task_id_t task_id : task_ids) {
-      TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
-      // TODO: multi gpu
-      register_wrapper_tasks(0, worker_procs[0], task_id, task_signature_impl);
-    }
-  }
+    OptimizerAttrs const &optimizer_attrs) {
+
+  ASSERT(is_subseteq_of(
+      keys(preallocated),
+      keys(get_all_training_tensor_shapes(training_computation_graph))));
+
+  LocalTaskRegistry local_task_registry =
+      construct_local_task_registry_for_layers(get_layer_attrs_mapping(
training_computation_graph.computation_graph)); + + register_tasks_for_realm(local_task_registry, runtime_state); + + LocalTensorBacking local_tensor_backing = construct_local_tensor_backing( + get_all_training_tensor_shapes(training_computation_graph), + preallocated, + runtime_state.allocators[0]); + + std::unordered_map> + per_device_op_states = generate_map( + topological_ordering(training_computation_graph.computation_graph), + [&](layer_guid_t const &layer_guid) { + return create_per_device_op_state( + local_task_registry, + local_tensor_backing, + runtime_arg_config, + runtime_state, + get_training_layer_plus_context(training_computation_graph, + layer_guid)); + }); - return task_registry; + LocalArgsBacking local_args_backing = + make_local_args_backing_for_computation_graph(runtime_arg_config, + per_device_op_states); + + return LocalTrainingBacking{ + /*computation_graph=*/training_computation_graph, + /*local_task_registry=*/local_task_registry, + /*local_tensor_backing=*/local_tensor_backing, + /*local_args_backing=*/local_args_backing, + }; } -RealmArgsBacking -initialize_args_backing(RealmTrainingBacking *backing, - ComputationGraph const &cg, - RuntimeArgConfig const &runtime_arg_config) { - std::unordered_map - per_device_op_states; - TaskRegistry const &task_registry = backing->task_registry; - RealmTensorBacking const &realm_tensor_backing = - backing->realm_tensor_backing; - Processor master_proc = backing->master_proc; - Memory master_mem = backing->master_mem; - std::vector &worker_procs = backing->worker_procs; - std::vector &worker_events = backing->worker_events; - // TODO: multi gpu - Allocator &allocator = backing->allocators[0]; - - for (layer_guid_t const &node : topological_ordering(cg)) { - if (registry_contains_task_for_layer(task_registry, node, - OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs; - - TaskInvocation invocation = lower_to_task_invocation( - init(attrs), node, get_incoming_inputs(cg, node), - get_incoming_input_shapes(cg, node), get_outgoing_tensors(cg, node), - get_incoming_weights(cg, node), - realm_tensor_backing.tensor_gradient_mapping, std::nullopt); - TaskArgumentAccessor accessor = get_task_arg_accessor( - realm_tensor_backing, - make_args_backing_with_empty_device_states(runtime_arg_config), - invocation, - allocator); - task_id_t task_id = invocation.task_id; - TaskImplFunction impl_function = - task_registry.task_mapping.at(task_id).impl_function; - // TODO: multi gpu launching - Promise promise = Promise(); - Future future = promise.get_future(); - RealmTaskArgs* task_arg = new RealmTaskArgs{ - task_id, impl_function, accessor, std::move(promise)}; - uintptr_t args[1] = {reinterpret_cast(task_arg)}; - Event e = - worker_procs[0].spawn(get_realm_task_id(task_id), - args, sizeof(uintptr_t), worker_events[0]); - worker_events[0] = e; - future.set_event(e); - per_device_op_states.insert({node, future.get().value()}); +// register tasks for realm runtime +void register_tasks_for_realm(LocalTaskRegistry const &local_task_registry, RealmRuntimeState &runtime_state) { + for (std::pair const &task : local_task_registry.task_mapping) { + task_id_t task_id = task.first; + TaskSignatureAndImpl task_signature_impl = task.second; + // TODO: multi gpu + register_wrapper_tasks(0, runtime_state.worker_procs[0], task_id, task_signature_impl); } +} + +std::optional + create_per_device_op_state(LocalTaskRegistry const &local_task_registry, + LocalTensorBacking const &tensor_backing, + RuntimeArgConfig const 
+
+std::optional<DeviceSpecificDeviceStates>
+    create_per_device_op_state(LocalTaskRegistry const &local_task_registry,
+                               LocalTensorBacking const &tensor_backing,
+                               RuntimeArgConfig const &runtime_arg_config,
+                               RealmRuntimeState &runtime_state,
+                               TrainingLayerPlusContext const &training_layer) {
+  std::optional<registered_task_t> maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::INIT);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return std::nullopt;
   }
-  return RealmArgsBacking{runtime_arg_config, per_device_op_states};
+  TaskInvocation invocation = lower_to_task_invocation(
+      /*op_task_invocation=*/get_init_op_task_invocation(
+          training_layer.layer_attrs.op_attrs),
+      /*training_layer=*/training_layer,
+      /*device_specific_device_states=*/std::nullopt);
+
+  TaskArgumentAccessor accessor = get_task_arg_accessor(
+      tensor_backing, runtime_arg_config, invocation, runtime_state.allocators[0]);
+
+  task_id_t task_id = invocation.task_id;
+  TaskImplFunction impl_function =
+      local_task_registry.task_mapping.at(task_id).impl_function;
+  // TODO: multi gpu launching
+  Promise<DeviceSpecificDeviceStates> promise = Promise<DeviceSpecificDeviceStates>();
+  Future<DeviceSpecificDeviceStates> future = promise.get_future();
+  RealmTaskArgs<DeviceSpecificDeviceStates>* task_arg =
+      new RealmTaskArgs<DeviceSpecificDeviceStates>{
+          task_id, impl_function, accessor,
+          std::move(promise)};
+  uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
+  Event e = runtime_state.worker_procs[0].spawn(
+      get_realm_task_id(task_id), args, sizeof(uintptr_t),
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
+  future.set_event(e);
+  return future.get().value();
 }

-Future<float>
-execute_forward(RealmTrainingBacking &realm_training_backing,
-                layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(realm_training_backing.task_registry,
-                                       operator_node, OpTaskType::FWD)) {
-    ComputationGraphOpAttrs attrs =
-        get_layer_attrs(realm_training_backing.computation_graph, operator_node)
-            .op_attrs;
-    std::optional<DeviceSpecificDeviceStates> device_state =
-        get_per_device_op_state_if_exists(
-            realm_training_backing.realm_args_backing, operator_node);
-    TaskInvocation invocation = lower_to_task_invocation(
-        forward(attrs), operator_node,
-        get_incoming_inputs(realm_training_backing.computation_graph,
-                            operator_node),
-        get_incoming_input_shapes(realm_training_backing.computation_graph,
-                                  operator_node),
-        get_outgoing_tensors(realm_training_backing.computation_graph,
-                             operator_node),
-        get_incoming_weights(realm_training_backing.computation_graph,
-                             operator_node),
-        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
-        device_state);
-    TaskArgumentAccessor accessor = get_task_arg_accessor(
-        realm_training_backing.realm_tensor_backing,
-        realm_training_backing.realm_args_backing, invocation,
-        realm_training_backing.allocators[0]);
-    task_id_t task_id = invocation.task_id;
-    TaskImplFunction impl_function =
-        realm_training_backing.task_registry.task_mapping.at(task_id)
-            .impl_function;
-    // TODO: multi gpu launching
-    Promise<float> promise(realm_training_backing.master_mem);
-    Future<float> future = promise.get_future();
-    RealmTaskArgs<float>* task_arg = new RealmTaskArgs<float>{task_id, impl_function, accessor,
-                                                              std::move(promise)};
-    uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-    Event e = realm_training_backing.worker_procs[0].spawn(
-        get_realm_task_id(task_id), args, sizeof(uintptr_t),
-        realm_training_backing.worker_events[0]);
-    realm_training_backing.worker_events[0] = e;
-    future.set_event(e);
-    return future;
-  } else {
-    return Future<float>(0.0f);
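// The spawn/promise pattern above recurs in every entry point that follows; a
// distilled sketch of the idiom, using a hypothetical helper name that is not
// part of this patch:
//
//   template <typename T>
//   Future<T> spawn_on_worker(RealmRuntimeState &rs, task_id_t task_id,
//                             TaskImplFunction impl, TaskArgumentAccessor acc) {
//     Promise<T> promise;
//     Future<T> future = promise.get_future();
//     auto *task_arg = new RealmTaskArgs<T>{task_id, impl, acc, std::move(promise)};
//     uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
//     // chain on the worker's previous event so tasks on one worker run in order
//     Event e = rs.worker_procs[0].spawn(get_realm_task_id(task_id), args,
//                                        sizeof(uintptr_t), rs.worker_events[0]);
//     rs.worker_events[0] = e;
//     future.set_event(e);
//     return future;
//   }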
+Future<std::optional<milliseconds_t>>
+    execute_forward(LocalTaskRegistry const &local_task_registry,
+                    LocalTensorBacking const &local_tensor_backing,
+                    LocalArgsBacking const &local_args_backing,
+                    TrainingLayerPlusContext const &training_layer,
+                    RealmRuntimeState &runtime_state) {
+
+  std::optional<registered_task_t> maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::FWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return Future<std::optional<milliseconds_t>>(std::nullopt);
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+
+  TaskInvocation invocation = lower_to_task_invocation(
+      /*op_task_invocation=*/get_forward_op_task_invocation(
+          training_layer.layer_attrs.op_attrs),
+      /*training_layer=*/training_layer,
+      /*device_specific_device_states=*/device_state);
+
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            runtime_state.allocators[0]);
+
+  task_id_t task_id = invocation.task_id;
+  TaskImplFunction impl_function =
+      local_task_registry.task_mapping.at(task_id).impl_function;
+  // TODO: multi gpu launching
+  Promise<std::optional<milliseconds_t>> promise(runtime_state.master_mem);
+  Future<std::optional<milliseconds_t>> future = promise.get_future();
+  RealmTaskArgs<std::optional<milliseconds_t>>* task_arg =
+      new RealmTaskArgs<std::optional<milliseconds_t>>{
+          task_id, impl_function, accessor,
+          std::move(promise)};
+  uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
+  Event e = runtime_state.worker_procs[0].spawn(
+      get_realm_task_id(task_id), args, sizeof(uintptr_t),
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
+  future.set_event(e);
+  return future;
 }

-Future<float>
-execute_backward(RealmTrainingBacking &realm_training_backing,
-                 layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(realm_training_backing.task_registry,
-                                       operator_node, OpTaskType::BWD)) {
-    ComputationGraphOpAttrs attrs =
-        get_layer_attrs(realm_training_backing.computation_graph, operator_node)
-            .op_attrs;
-    std::optional<DeviceSpecificDeviceStates> device_state =
-        get_per_device_op_state_if_exists(
-            realm_training_backing.realm_args_backing, operator_node);
-    TaskInvocation invocation = lower_to_task_invocation(
-        forward(attrs), operator_node,
-        get_incoming_inputs(realm_training_backing.computation_graph,
-                            operator_node),
-        get_incoming_input_shapes(realm_training_backing.computation_graph,
-                                  operator_node),
-        get_outgoing_tensors(realm_training_backing.computation_graph,
-                             operator_node),
-        get_incoming_weights(realm_training_backing.computation_graph,
-                             operator_node),
-        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
-        device_state);
-    TaskArgumentAccessor accessor = get_task_arg_accessor(
-        realm_training_backing.realm_tensor_backing,
-        realm_training_backing.realm_args_backing, invocation,
-        realm_training_backing.allocators[0]);
-    task_id_t task_id = invocation.task_id;
-    TaskImplFunction impl_function =
-        realm_training_backing.task_registry.task_mapping.at(task_id)
-            .impl_function;
-    // TODO: multi gpu launching
-    Promise<float> promise(realm_training_backing.master_mem);
-    Future<float> future = promise.get_future();
-    RealmTaskArgs<float>* task_arg = new RealmTaskArgs<float>{task_id, impl_function, accessor,
-                                                              std::move(promise)};
-    uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-    Event e = realm_training_backing.worker_procs[0].spawn(
-        get_realm_task_id(task_id), args, sizeof(uintptr_t),
-        realm_training_backing.worker_events[0]);
-    realm_training_backing.worker_events[0] = e;
-    future.set_event(e);
-    return future;
-  } else {
-    return Future<float>(0.0f);
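// A sketch of how these per-layer entry points would drive one training step
// (illustrative only; assumes a `reversed` container helper and the getters
// introduced elsewhere in this patch):
//
//   for (layer_guid_t const &l : topological_ordering(training_cg.computation_graph)) {
//     execute_forward(registry, tensors, args,
//                     get_training_layer_plus_context(training_cg, l), rs);
//   }
//   compute_loss(backing, loss_attrs, rs);
//   for (layer_guid_t const &l : reversed(topological_ordering(training_cg.computation_graph))) {
//     execute_backward(registry, tensors, args,
//                      get_training_layer_plus_context(training_cg, l), rs);
//   }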
+Future<std::optional<milliseconds_t>>
+    execute_backward(LocalTaskRegistry const &local_task_registry,
+                     LocalTensorBacking const &local_tensor_backing,
+                     LocalArgsBacking const &local_args_backing,
+                     TrainingLayerPlusContext const &training_layer,
+                     RealmRuntimeState &runtime_state) {
+
+  std::optional<registered_task_t> maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::BWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return Future<std::optional<milliseconds_t>>(std::nullopt);
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+  TaskInvocation invocation = lower_to_task_invocation(
+      get_backward_op_task_invocation(training_layer.layer_attrs.op_attrs),
+      training_layer,
+      device_state);
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            runtime_state.allocators[0]);
+
+  task_id_t task_id = invocation.task_id;
+  TaskImplFunction impl_function =
+      local_task_registry.task_mapping.at(task_id).impl_function;
+  // TODO: multi gpu launching
+  Promise<std::optional<milliseconds_t>> promise(runtime_state.master_mem);
+  Future<std::optional<milliseconds_t>> future = promise.get_future();
+  RealmTaskArgs<std::optional<milliseconds_t>>* task_arg =
+      new RealmTaskArgs<std::optional<milliseconds_t>>{
+          task_id, impl_function, accessor,
+          std::move(promise)};
+  uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
+  Event e = runtime_state.worker_procs[0].spawn(
+      get_realm_task_id(task_id), args, sizeof(uintptr_t),
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
+  future.set_event(e);
+  return future;
 }
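// For the SGD case, the get_update_invocation call in execute_update below
// bottoms out in sgd_update (see the optimizer.h changes later in this patch);
// roughly, and assuming SGD with momentum keeps exactly one velocity buffer:
//
//   TaskInvocation inv = sgd_update(sgd_attrs,
//                                   /*weight=*/group.forward_tensor,
//                                   /*weight_grad=*/group.gradient_tensor,
//                                   /*sgd_v=*/get_only(group.optimizer_tensors));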

-Future execute_update(RealmTrainingBacking &realm_training_backing,
-                      layer_guid_t const &node,
-                      OptimizerAttrs const &optimizer_attrs) {
-  LayerAttrs layer_attrs =
-      get_layer_attrs(realm_training_backing.computation_graph, node);
-  if (layer_attrs.op_attrs.has<WeightAttrs>()) {
-    // get tensors
-    tensor_guid_t weight_tensor = get_only(
-        get_outgoing_tensors(realm_training_backing.computation_graph, node));
-
-    gradient_tensor_t weight_grad_tensor =
-        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at(
-            weight_tensor);
-    std::vector<optimizer_tensor_t> optimizer_buffer_tensors =
-        realm_training_backing.realm_tensor_backing.tensor_optimizer_mapping.at(
-            weight_tensor);
-
-    // get invocation
+Future execute_update(LocalTrainingBacking const &local_training_backing,
+                      layer_guid_t const &layer_guid,
+                      OptimizerAttrs const &optimizer_attrs,
+                      RealmRuntimeState &runtime_state) {
+  TrainingLayerPlusContext training_layer = get_training_layer_plus_context(
+      local_training_backing.training_computation_graph, layer_guid);
+
+  if (training_layer.layer_attrs.op_attrs.has<WeightAttrs>()) {
+    TrainingTensorGroupWithAttrs weight_tensor_group =
+        get_only(training_layer.output_tensor_groups);
+
     TaskInvocation invocation =
-        get_update_invocation(optimizer_attrs, weight_tensor,
-                              weight_grad_tensor, optimizer_buffer_tensors);
+        get_update_invocation(optimizer_attrs,
+                              weight_tensor_group.forward_tensor,
+                              weight_tensor_group.gradient_tensor,
+                              weight_tensor_group.optimizer_tensors);
     // TODO: https://github.com/flexflow/flexflow-train/issues/1442
     // assert(is_invocation_valid(get_update_signature(attrs), invocation));

-    // execute update
     TaskArgumentAccessor accessor = get_task_arg_accessor(
-        realm_training_backing.realm_tensor_backing,
-        realm_training_backing.realm_args_backing, invocation,
-        realm_training_backing.allocators[0]);
+        local_training_backing.local_tensor_backing,
+        local_training_backing.local_args_backing.runtime_arg_config,
+        invocation,
+        runtime_state.allocators[0]);
+    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
+
     task_id_t task_id = invocation.task_id;
-    register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0],
+    register_wrapper_tasks_generic(0, runtime_state.worker_procs[0],
                                    task_id);
-    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
     // TODO: multi gpu launching
     Promise promise;
     Future future = promise.get_future();
     RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, update_impl_fn, accessor,
                                                 std::move(promise)};
     uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-    Event e = realm_training_backing.worker_procs[0].spawn(
+    Event e = runtime_state.worker_procs[0].spawn(
         get_realm_task_id(task_id), args, sizeof(uintptr_t),
-        realm_training_backing.worker_events[0]);
-    realm_training_backing.worker_events[0] = e;
+        runtime_state.worker_events[0]);
+    runtime_state.worker_events[0] = e;
     future.set_event(e);
     return future;
-  } else {
-    return Future();
   }
 }

-Future compute_loss(RealmTrainingBacking &realm_training_backing,
-                    LossAttrs const &loss_attrs,
-                    tensor_guid_t const &logit_tensor,
-                    loss_tensor_t const &label_tensor) {
+Future compute_loss(LocalTrainingBacking const &local_training_backing,
+                    LossAttrs const &loss_attrs,
+                    RealmRuntimeState &runtime_state) {
+
+  TrainingComputationGraph training_cg =
+      local_training_backing.training_computation_graph;
+  tensor_guid_t logit_tensor = training_cg.logit_tensor;
+  loss_tensor_guid_t label_tensor = training_cg.label_tensor;
+
   TaskInvocation loss_invocation = backward(
-      loss_attrs, logit_tensor,
-      realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at(
-          logit_tensor),
+      loss_attrs,
+      get_forward_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
+      get_gradient_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
       label_tensor);
   // TODO: https://github.com/flexflow/flexflow-train/issues/1442
   // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
   TaskArgumentAccessor loss_accessor = get_task_arg_accessor(
-      realm_training_backing.realm_tensor_backing,
-      realm_training_backing.realm_args_backing, loss_invocation,
-      realm_training_backing.allocators[0]);
-  task_id_t task_id = loss_invocation.task_id;
-  register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0],
-                                 task_id);
+      local_training_backing.local_tensor_backing,
+      local_training_backing.local_args_backing.runtime_arg_config,
+      loss_invocation,
+      runtime_state.allocators[0]);
   TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+
+  task_id_t task_id = loss_invocation.task_id;
+  register_wrapper_tasks_generic(0, runtime_state.worker_procs[0],
+                                 task_id);
   // TODO: multi gpu launching
   Promise promise;
   Future future = promise.get_future();
   RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, loss_impl_fn, loss_accessor,
                                               std::move(promise)};
   uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-  Event e = realm_training_backing.worker_procs[0].spawn(
+  Event e = runtime_state.worker_procs[0].spawn(
       get_realm_task_id(task_id), args, sizeof(uintptr_t),
-      realm_training_backing.worker_events[0]);
-  realm_training_backing.worker_events[0] = e;
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
   future.set_event(e);
   return future;
 }
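// Taken together, these entry points support the epoch loop that
// ModelTrainingInstance wraps in the e2e test below; per epoch it reduces to:
//
//   model_training_instance.forward();   // execute_forward over each layer
//   model_training_instance.backward();  // compute_loss, then execute_backward in reverse
//   model_training_instance.update();    // execute_update for each weight layer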

-TaskArgumentAccessor
-get_task_arg_accessor(RealmTensorBacking const &realm_tensor_backing,
-                      RealmArgsBacking const &realm_args_backing,
-                      TaskInvocation const &invocation,
-                      Allocator &allocator) {
-  TensorSlotsBacking tensor_slots_backing =
-      construct_tensor_slots_backing(realm_tensor_backing, invocation.binding);
-  ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing(
-      invocation.binding, realm_args_backing.runtime_arg_config);
-  // TODO: multi gpu
-  return TaskArgumentAccessor::create(
-      allocator, tensor_slots_backing, arg_slots_backing);
-}
-
 } // namespace FlexFlow
diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc
index cb220f44dc..b81494dce4 100644
--- a/lib/realm-backend/src/task_wrapper.cc
+++ b/lib/realm-backend/src/task_wrapper.cc
@@ -26,12 +26,13 @@ void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata,
                          size_t userlen, Processor p) {
   assert(arglen == sizeof(uintptr_t));
   uintptr_t task_arg_ptr = *reinterpret_cast<uintptr_t const *>(args);
-  RealmTaskArgs<float> *task_args =
-      reinterpret_cast<RealmTaskArgs<float> *>(task_arg_ptr);
+  RealmTaskArgs<std::optional<milliseconds_t>> *task_args =
+      reinterpret_cast<RealmTaskArgs<std::optional<milliseconds_t>> *>(task_arg_ptr);
   auto fn = task_args->impl_function.get<FwdBwdOpTaskImplFunction>().function_ptr;
-  std::optional<float> result = fn(task_args->accessor);
-  task_args->promise.set_value(result.has_value() ? result.value() : 0.0f);
+  std::optional<milliseconds_t> result = transform(
+      fn(task_args->accessor), [](float running_time) { return milliseconds_t{running_time}; });
+  task_args->promise.set_value(std::move(result));
   delete task_args;
 }
diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc
index fa0976991d..66ff034240 100644
--- a/lib/realm-backend/test/src/test_e2e.cc
+++ b/lib/realm-backend/test/src/test_e2e.cc
@@ -1,17 +1,25 @@
+#include "test_utils.h"
 #include "kernels/compare_tensor_accessors.h"
+#include "kernels/copy_tensor_accessor.h"
 #include "kernels/format_accessor_contents.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/local_cuda_allocator.h"
+#include "kernels/managed_ff_stream.h"
+#include "kernels/managed_per_device_ff_handle.h"
 #include "kernels/tensor_accessor_reductions.h"
-#include "kernels/test_utils.h"
-#include "local-execution/allocated_tensors.h"
-#include "realm-backend/realm_allocator.h"
 #include "realm-backend/realm_training_backing.h"
+#include "realm-backend/model_training_instance.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "test_utils.h"
+#include "task-spec/forward_tensor_source.h"
+#include "task-spec/gradient_tensor_source.h"
+#include "task-spec/loss_tensor_source.h"
+#include "task-spec/optimizer_tensor_source.h"
+#include "task-spec/runtime_arg_config.h"
+#include "task-spec/training_computation_graph.h"
 #include "utils/containers/get_only.h"
-#include "realm-backend/model_training_instance.h"

 using namespace ::FlexFlow;
 using namespace Realm;
@@ -26,158 +34,166 @@ bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,

 void top_level_task(const void *args, size_t arglen, const void *userdata,
                     size_t userlen, Realm::Processor p) {
-  // initialize runtime
-  ManagedFFStream managed_stream{};
-  ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
-      /*workSpaceSize=*/1024 * 1024,
-      /*allowTensorOpMathConversion=*/true);
-  std::vector<Processor> worker_procs;
-  std::vector<Allocator> allocators;
-  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
-                                   .only_kind(Processor::TOC_PROC);
-  assert(pq.count() > 0);
-  for (Processor p : pq) {
-    worker_procs.push_back(p);
-    allocators.push_back(create_realm_memory_allocator(p));
-  }
-
-
// allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - - positive_int batch_size = 10_p; - positive_int data_dim = 16_p; - positive_int hidden_dim = 32_p; - positive_int output_dim = 1_p; - - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - - GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( - output_tensor_shape, allocators[0]); - - // construct computation graph - ComputationGraph computation_graph = make_empty_computation_graph(); - - TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; - TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; - - GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( - weight_shape_1, allocators[0]); - GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( - weight_shape_2, allocators[0]); - - LayerAddedResult inputs_layer = - add_input_layer_with_grad(computation_graph, input_tensor_shape); - tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); - GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( - input_tensor_shape, allocators[0]); - - LayerAddedResult weights_layer_1 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, - std::nullopt}, - {}, - {}); - tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); - - LayerAddedResult weights_layer_2 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, - std::nullopt}, - {}, - {}); - tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); - - LayerAddedResult linear_operator_1 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, - /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, - std::nullopt}}, - std::nullopt}, - inputs_layer.outputs, - weights_layer_1.outputs); - - LayerAddedResult linear_operator_2 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, - std::nullopt}}, - std::nullopt}, - linear_operator_1.outputs, - weights_layer_2.outputs); - - tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - - - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - - AllocatedTensors allocated_tensors = AllocatedTensors{ - /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, - {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, - 
{TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing},
-      },
-      /*gradient_mapping=*/{},
-      /*optimizer_mapping*/ {},
-  };
-
-  {
-    printf("\nRunning test %d: E2ETest...\n", 1);
-    RealmTrainingBacking realm_training_backing = RealmTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    // begin training loop
-    ModelTrainingInstance model_training_instance = ModelTrainingInstance{
-        realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs
+  // initialize runtime
+  ManagedFFStream managed_stream{};
+  ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+      /*workSpaceSize=*/1024 * 1024,
+      /*allowTensorOpMathConversion=*/true);
+
+  Memory master_mem = Machine::MemoryQuery(Machine::get_machine())
+                          .only_kind(Memory::SYSTEM_MEM)
+                          .best_affinity_to(p)
+                          .first();
+  std::vector<Processor> worker_procs;
+  std::vector<Event> worker_events;
+  std::vector<Allocator> allocators;
+  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
+                                   .only_kind(Processor::TOC_PROC);
+  assert(pq.count() > 0);
+  for (Processor p : pq) {
+    worker_procs.push_back(p);
+    worker_events.push_back(Event::NO_EVENT);
+    allocators.push_back(create_realm_memory_allocator(p));
+  }
+  RealmRuntimeState runtime_state = RealmRuntimeState{
+      p, Event::NO_EVENT, master_mem, worker_procs, worker_events, allocators};
+
+  positive_int batch_size = 10_p;
+  positive_int data_dim = 16_p;
+  positive_int hidden_dim = 32_p;
+  positive_int output_dim = 1_p;
+
+  TensorShape output_tensor_shape = TensorShape{
+      TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+
+  // TODO: multi gpu
+  GenericTensorAccessorW label_tensor_backing =
+      runtime_state.allocators[0].allocate_tensor(output_tensor_shape);
+
+  // construct computation graph
+  ComputationGraph computation_graph = make_empty_computation_graph();
+
+  TensorShape input_tensor_shape = TensorShape{
+      TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+  TensorShape weight_shape_1 = TensorShape{
+      TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT};
+  TensorShape weight_shape_2 = TensorShape{
+      TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT};
+
+  LayerAddedResult inputs_layer =
+      add_input_layer_with_grad(computation_graph, input_tensor_shape);
+
+  LayerAddedResult weights_layer_1 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                     weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                 std::nullopt},
+      {},
+      {});
+
+  LayerAddedResult weights_layer_2 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                     weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                 std::nullopt},
+      {},
+      {});
+
+  LayerAddedResult linear_operator_1 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim,
+                                                     /*use_bias=*/false,
+                                                     DataType::FLOAT,
+                                                     Activation::RELU,
+                                                     std::nullopt}},
+                 std::nullopt},
+      inputs_layer.outputs,
+      weights_layer_1.outputs);
+
+  LayerAddedResult linear_operator_2 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim,
+                                                     /*use_bias=*/false,
+                                                     DataType::FLOAT,
+                                                     Activation::RELU,
+                                                     std::nullopt}},
+                 std::nullopt},
+      linear_operator_1.outputs,
+      weights_layer_2.outputs);
+
+  tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs);
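// A quick shape check for the two-layer MLP just built:
//   input  [batch=10 x data=16] @ weight_1 [16 x 32] -> hidden [10 x 32] (ReLU)
//   hidden [10 x 32]            @ weight_2 [32 x 1]  -> logits [10 x 1]  (ReLU)
// so logit_tensor has output_tensor_shape, matching the label backing above.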
+
+  RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config(
+      managed_handle.raw_handle(),
+      EnableProfiling::YES,
+      ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1});
+
+  // initialize training backing
+  LossAttrs loss_attrs = LossAttrs{
+      NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+  OptimizerAttrs optimizer_attrs = OptimizerAttrs{
+      SGDOptimizerAttrs{
+          /*lr=*/0.001,
+          /*momentum=*/0.9,
+          /*nesterov=*/false,
+          /*weight_decay=*/0.001,
+      },
   };
-    Allocator cpu_allocator = create_local_cpu_memory_allocator();
-
-    int num_epochs = 5;
-    std::vector<GenericTensorAccessorR> loss_values;
-
-    for (int i = 0; i < num_epochs; i++) {
-      model_training_instance.forward();
-      model_training_instance.backward();
-      model_training_instance.update();
-      loss_values.push_back(copy_tensor_accessor_r(
-          model_training_instance.get_loss_tensor_accessor(), cpu_allocator));
+  ForwardTensorSource forward_tensor_source;
+  GradientTensorSource gradient_tensor_source;
+  OptimizerTensorSource optimizer_tensor_source;
+  LossTensorSource loss_tensor_source;
+
+  TrainingComputationGraph training_computation_graph =
+      generate_training_computation_graph(computation_graph,
+                                          optimizer_attrs,
+                                          logit_tensor,
+                                          forward_tensor_source,
+                                          gradient_tensor_source,
+                                          optimizer_tensor_source,
+                                          loss_tensor_source);
+
+  LocalTrainingBacking local_training_backing =
+      make_local_training_backing_for_computation_graph(
+          /*runtime_state=*/runtime_state,
+          /*preallocated_tensors=*/
+          {
+              {
+                  training_tensor_guid_t{
+                      training_computation_graph.label_tensor},
+                  label_tensor_backing,
+              },
+          },
+          /*training_computation_graph=*/training_computation_graph,
+          /*runtime_arg_config=*/runtime_arg_config,
+          /*optimizer_attrs=*/optimizer_attrs);
+
+  // begin training loop
+  ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+      runtime_state, local_training_backing, loss_attrs, optimizer_attrs};
+
+  {
+    printf("\nRunning test %d: E2ETest...\n", 1);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    int num_epochs = 5;
+    std::vector<GenericTensorAccessorR> loss_values;
+
+    for (int i = 0; i < num_epochs; i++) {
+      model_training_instance.forward();
+      model_training_instance.backward();
+      model_training_instance.update();
+      loss_values.push_back(copy_tensor_accessor_r(
+          model_training_instance.get_loss_tensor_accessor(), cpu_allocator));
+    }
+
+    // Assert that each sample in the batch has a lower loss in last epoch than
+    // the first epoch
+    GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+    GenericTensorAccessorR last_epoch = loss_values.back();
+    assert(did_loss_decrease(first_epoch_loss, last_epoch));
+    printf("passed\n");
   }
-
-    // Assert that each sample in the batch has a lower loss in last epoch than
-    // the first epoch
-    GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
-
-    GenericTensorAccessorR last_epoch = loss_values.back();
-
-    assert(did_loss_decrease(first_epoch_loss, last_epoch));
-    printf("passed\n");
-  }
 }
diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc
index b1f6bebe74..cd7119271d 100644
--- a/lib/realm-backend/test/src/test_update.cc
+++ b/lib/realm-backend/test/src/test_update.cc
@@ -6,7 +6,7 @@
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-backend/driver.h"
 #include "realm-backend/realm_allocator.h"
-#include "realm-backend/realm_training_backing.h"
+#include "realm-backend/local_training_backing.h"
 #include "test_utils.h"

 using namespace ::FlexFlow;
@@ -80,11 +80,11 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
                                   /*momentum=*/0.0f,
                                   /*nesterov=*/false,
                                   /*weight_decay=*/0.001}};
-
RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + LocalTrainingBacking local_training_backing = LocalTrainingBacking( p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs); - execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait(); + execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait(); printf("passed\n"); } @@ -95,11 +95,11 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + LocalTrainingBacking local_training_backing = LocalTrainingBacking( p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs); - execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait(); + execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait(); printf("passed\n"); } @@ -114,11 +114,11 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, /*beta_t=*/0.9, /*beta2_t=*/0.999, /*epsilon=*/1e-8}}; - RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + LocalTrainingBacking local_training_backing = LocalTrainingBacking( p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs); - execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait(); + execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait(); printf("passed\n"); } } diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc deleted file mode 100644 index 83e7c15460..0000000000 --- a/lib/runtime/src/ops/embedding.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "embedding.h" -#include "kernels/embedding_kernels.h" -#include "op-attrs/get_output_shapes.h" -#include "op-attrs/ops/embedding.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Embedding; - -enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(EmbeddingAttrs const &attrs) { - OpTaskBinding b; - - b.bind(INPUT, input_tensor(0)); - b.bind(WEIGHT, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_arg(ATTRS, attrs); - b.bind_arg(PROFILING, profiling_settings()); - - return {EMBED_FWD_TASK_ID, b}; -} - -OpTaskInvocation backward(EmbeddingAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {EMBED_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - EmbeddingAttrs attrs = acc.get_argument(ATTRS); - - return profile(forward_kernel, - profiling, - "[Embedding] forward_time = {:.2lf}ms\n", - input, - output, - weight, - input.data_type, - output.data_type, - attrs.aggr, - input.shape.get_dim(), - output.shape.get_dim(), - input.shape[legion_dim_t(1)]); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto weight_grad = acc.get_tensor_grad(WEIGHT); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - EmbeddingAttrs attrs = acc.get_argument(ATTRS); - - return profile(backward_kernel, - profiling, - "[Embedding] backward_time = {:.2lf}ms\n", - output, - input, - weight_grad, - output.data_type, - input.data_type, - attrs.aggr, - input.shape.get_dim(), - output.shape.get_dim(), - input.shape.at(ff_dim_t{nonnegative_int{0}})); -} - -TaskImplFunction get_embedding_fwd_task_impl() { - return forward_task_impl; -} -TaskImplFunction get_embedding_bwd_task_impl() { - return backward_task_impl; -} - -OpTaskSignature get_embedding_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(INPUT); - fwd.add_input_slot(OUTPUT); - fwd.add_input_slot(WEIGHT); - - fwd.add_arg_slot(ATTRS); - fwd.add_arg_slot(PROFILING); - - return fwd; -} - -OpTaskSignature get_embedding_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_embedding_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(EmbeddingAttrs const &) { - return {EMBED_FWD_TASK_ID, EMBED_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/concrete_arg.h b/lib/task-spec/include/task-spec/concrete_arg_spec.h similarity index 89% rename from lib/task-spec/include/task-spec/concrete_arg.h rename to lib/task-spec/include/task-spec/concrete_arg_spec.h index 7b2ece59a7..24a96e9f78 100644 --- a/lib/task-spec/include/task-spec/concrete_arg.h +++ b/lib/task-spec/include/task-spec/concrete_arg_spec.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H -#define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_CONCRETE_ARG_SPEC_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_CONCRETE_ARG_SPEC_H #include "fmt/format.h" #include "task-spec/serialization.h" @@ -15,7 +15,7 @@ struct ConcreteArgSpec { template T const &get() const { - assert(matches(this->type_idx)); + ASSERT(matches(this->type_idx), this->type_idx.name()); return *(T const *)ptr.get(); } diff --git 
a/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml
index 944dddc3df..b77850c50d 100644
--- a/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml
+++ b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml
@@ -5,82 +5,71 @@ features = [
 ]

 includes = [
-  "kernels/attention_kernels.h",
+  "kernels/mha_per_device_state.dtg.h",
   "kernels/batch_norm_per_device_state.dtg.h",
-  "kernels/conv_2d_kernels.h",
-  "kernels/dropout_kernels.h",
-  "kernels/element_binary_kernels.h",
-  "kernels/element_unary_kernels.h",
-  "kernels/gather_kernels.h",
-  "kernels/layer_norm_kernels.h",
-  "kernels/linear_kernels.h",
-  "kernels/partition_kernels.h",
-  "kernels/pool_2d_kernels.h",
-  "kernels/reduce_kernels.h",
-  "kernels/reduction_kernels.h",
-  "kernels/reshape_kernels.h",
-  "kernels/softmax_kernels.h",
-  "kernels/topk_kernels.h",
-  "kernels/transpose_kernels.h",
+  "kernels/conv_2d_per_device_state.dtg.h",
+  "kernels/dropout_per_device_state.dtg.h",
+  "kernels/element_binary_per_device_state.dtg.h",
+  "kernels/element_unary_per_device_state.dtg.h",
+  "kernels/gather_per_device_state.dtg.h",
+  "kernels/layer_norm_per_device_state.dtg.h",
+  "kernels/linear_per_device_state.dtg.h",
+  "kernels/partition_per_device_state.dtg.h",
+  "kernels/pool_2d_per_device_state.dtg.h",
+  "kernels/reduce_per_device_state.dtg.h",
+  "kernels/softmax_per_device_state.dtg.h",
   "task-spec/device_specific.h",
+  "<optional>",
 ]

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::MHAPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::MHAPerDeviceState>>"
 key = "device_specific_mha_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::BatchNormPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::BatchNormPerDeviceState>>"
 key = "device_specific_batch_norm_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::Conv2DPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::Conv2DPerDeviceState>>"
 key = "device_specific_conv2d_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::DropoutPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::DropoutPerDeviceState>>"
 key = "device_specific_dropout_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::ElementBinaryPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ElementBinaryPerDeviceState>>"
 key = "device_specific_element_binary_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::ElementUnaryPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ElementUnaryPerDeviceState>>"
 key = "device_specific_element_unary_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::GatherPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::GatherPerDeviceState>>"
 key = "device_specific_gather_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::LayerNormPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::LayerNormPerDeviceState>>"
 key = "device_specific_layer_norm_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::LinearPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::LinearPerDeviceState>>"
 key = "device_specific_linear_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::Pool2DPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::Pool2DPerDeviceState>>"
 key = "device_specific_pool_2d_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::ReducePerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ReducePerDeviceState>>"
 key = "device_specific_reduce_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::RepartitionPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::RepartitionPerDeviceState>>"
"device_specific_repartition_per_device_state" [[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::ReshapePerDeviceState>" -key = "device_specific_reshape_per_device_state" - -[[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::SoftmaxPerDeviceState>" +type = "::FlexFlow::DeviceSpecific>" key = "device_specific_softmax_per_device_state" - -[[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::TopKPerDeviceState>" -key = "device_specific_topk_per_device_state" diff --git a/lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml b/lib/task-spec/include/task-spec/forward_tensor_guid_t.struct.toml similarity index 79% rename from lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/forward_tensor_guid_t.struct.toml index 5d3e05f673..68fc4b6815 100644 --- a/lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/forward_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "optimizer_tensor_t" +name = "forward_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/forward_tensor_source.h b/lib/task-spec/include/task-spec/forward_tensor_source.h new file mode 100644 index 0000000000..7adde6e145 --- /dev/null +++ b/lib/task-spec/include/task-spec/forward_tensor_source.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FORWARD_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FORWARD_TENSOR_SOURCE_H + +#include "task-spec/forward_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct ForwardTensorSource { +public: + ForwardTensorSource(); + + forward_tensor_guid_t new_forward_tensor(); + + void reset(); + +private: + static int next_available_forward_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml b/lib/task-spec/include/task-spec/gradient_tensor_guid_t.struct.toml similarity index 78% rename from lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/gradient_tensor_guid_t.struct.toml index 5367ccee07..b75e27a9d2 100644 --- a/lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/gradient_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "gradient_tensor_t" +name = "gradient_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/gradient_tensor_source.h b/lib/task-spec/include/task-spec/gradient_tensor_source.h new file mode 100644 index 0000000000..14ebf05d43 --- /dev/null +++ b/lib/task-spec/include/task-spec/gradient_tensor_source.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GRADIENT_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GRADIENT_TENSOR_SOURCE_H + +#include "task-spec/gradient_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct GradientTensorSource { +public: + GradientTensorSource(); + + gradient_tensor_guid_t new_gradient_tensor(); + + void reset(); + +private: + static int next_available_gradient_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/init_op_task_impl_function.h b/lib/task-spec/include/task-spec/init_op_task_impl_function.h index f82d249df1..97daa7ef56 100644 --- a/lib/task-spec/include/task-spec/init_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/init_op_task_impl_function.h @@ -7,15 +7,16 @@ namespace FlexFlow { struct 
InitOpTaskImplFunction { - - DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); - +public: bool operator==(InitOpTaskImplFunction const &) const; bool operator!=(InitOpTaskImplFunction const &) const; bool operator<(InitOpTaskImplFunction const &) const; bool operator>(InitOpTaskImplFunction const &) const; bool operator<=(InitOpTaskImplFunction const &) const; bool operator>=(InitOpTaskImplFunction const &) const; + +public: + DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); }; std::string format_as(InitOpTaskImplFunction const &x); diff --git a/lib/task-spec/include/task-spec/itask_argument_accessor.h b/lib/task-spec/include/task-spec/itask_argument_accessor.h index e7d1a81760..2e693e7983 100644 --- a/lib/task-spec/include/task-spec/itask_argument_accessor.h +++ b/lib/task-spec/include/task-spec/itask_argument_accessor.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "task-spec/concrete_arg.h" +#include "task-spec/concrete_arg_spec.h" #include "task-spec/op_task_signature.h" #include "task-spec/privilege_tensor_accessor.h" #include "task-spec/tensor_type.dtg.h" diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/task-spec/include/task-spec/loss_functions.h similarity index 69% rename from lib/local-execution/include/local-execution/loss_functions.h rename to lib/task-spec/include/task-spec/loss_functions.h index c75d4414de..a5f5886caa 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/task-spec/include/task-spec/loss_functions.h @@ -13,12 +13,13 @@ * limitations under the License. */ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_FUNCTIONS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_FUNCTIONS_H #include "op-attrs/ops/loss_functions.h" -#include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/forward_tensor_guid_t.dtg.h" +#include "task-spec/gradient_tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" #include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" @@ -28,9 +29,9 @@ namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); TaskInvocation backward(LossAttrs const &, - tensor_guid_t logit, - gradient_tensor_t logit_grad, - loss_tensor_t label); + forward_tensor_guid_t logit, + gradient_tensor_guid_t logit_grad, + loss_tensor_guid_t label); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml b/lib/task-spec/include/task-spec/loss_tensor_guid_t.struct.toml similarity index 87% rename from lib/task-spec/include/task-spec/loss_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/loss_tensor_guid_t.struct.toml index 405385069f..c00ccbb0f2 100644 --- a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/loss_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "loss_tensor_t" +name = "loss_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/task-spec/include/task-spec/loss_tensor_source.h similarity index 50% rename from 
lib/local-execution/include/local-execution/loss_tensor_source.h rename to lib/task-spec/include/task-spec/loss_tensor_source.h index b794207c7f..21091109e5 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/task-spec/include/task-spec/loss_tensor_source.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_TENSOR_SOURCE_H -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -10,7 +10,7 @@ struct LossTensorSource { public: LossTensorSource(); - loss_tensor_t new_loss_tensor(); + loss_tensor_guid_t new_loss_tensor(); private: static nonnegative_int next_available_loss_tensor_id; diff --git a/lib/task-spec/include/task-spec/op_arg_ref.h b/lib/task-spec/include/task-spec/op_arg_ref.h index d95573787a..88882abd46 100644 --- a/lib/task-spec/include/task-spec/op_arg_ref.h +++ b/lib/task-spec/include/task-spec/op_arg_ref.h @@ -15,14 +15,16 @@ using OpArgRef = ArgRef; using OpArgRefSpec = ArgRefSpec; template -OpArgRef per_device_op_state() { +OpArgRef per_device_op_state() { OpArgRefType op_arg_ref_type = OpArgRefType{PerDeviceOpStateRefType{}}; static_assert(PerDeviceOpState::IsPartOfPerDeviceOpState_v); - ArgRef arg_ref = {op_arg_ref_type}; + ArgRef arg_ref = {op_arg_ref_type}; return arg_ref; } -OpArgRef input_parallel_tensor_shape(int idx); +OpArgRef input_parallel_tensor_shape(nonnegative_int idx); +OpArgRef weight_parallel_tensor_shape(nonnegative_int idx); +OpArgRef output_parallel_tensor_shape(nonnegative_int idx); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/op_arg_spec.variant.toml b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml index e52e5c914e..a03bc222e8 100644 --- a/lib/task-spec/include/task-spec/op_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "task-spec/concrete_arg.h", + "task-spec/concrete_arg_spec.h", "task-spec/op_arg_ref.h", "task-spec/runtime_arg_ref.h", ] diff --git a/lib/task-spec/include/task-spec/op_task_binding.h b/lib/task-spec/include/task-spec/op_task_binding.h new file mode 100644 index 0000000000..bcfea33877 --- /dev/null +++ b/lib/task-spec/include/task-spec/op_task_binding.h @@ -0,0 +1,97 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OP_TASK_BINDING_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OP_TASK_BINDING_H + +#include "task-spec/op_arg_ref.h" +#include "task-spec/op_arg_spec.dtg.h" +#include "task-spec/op_tensor_spec.h" +#include "task-spec/slot_grad_id.dtg.h" +#include "task-spec/slot_id_t.dtg.h" +#include "task-spec/variadic_tensor_ref.h" + +namespace FlexFlow { + +struct OpTaskBinding { + OpTaskBinding() = default; + + void bind(int, VariadicTensorRef const &); + void bind(slot_id_t, VariadicTensorRef const &); + + void bind(int, OpTensorSpec const &); + void bind(slot_id_t, OpTensorSpec const &); + + void bind_grad(int, OpTensorSpec const &); + void bind_grad(slot_id_t, OpTensorSpec const &); + + template + void bind_device_specific_arg(int name, T const &t) { + this->bind_device_specific_arg(slot_id_t{name}, t); + } + + template + void bind_device_specific_arg(slot_id_t name, T const &t) { + NOT_IMPLEMENTED(); + } + + template + void bind_device_specific_arg(int name, 
OpArgRef const &t) { + this->bind_device_specific_arg(slot_id_t{name}, t); + } + + template + void bind_device_specific_arg(slot_id_t name, OpArgRef const &t) { + NOT_IMPLEMENTED(); + } + + template + void bind_arg(int name, T const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, T const &t) { + this->insert_arg_spec(name, OpArgSpec{ConcreteArgSpec::create(t)}); + } + + template + void bind_arg(int name, RuntimeArgRef const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, RuntimeArgRef const &ref) { + this->insert_arg_spec(name, OpArgSpec{RuntimeArgRefSpec::create(ref)}); + } + + template + void bind_arg(int name, OpArgRef const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, OpArgRef const &ref) { + this->insert_arg_spec(name, OpArgSpec{OpArgRefSpec::create(ref)}); + } + bool operator==(OpTaskBinding const &other) const; + bool operator!=(OpTaskBinding const &other) const; + + std::unordered_map const & + get_tensor_bindings() const; + std::unordered_map const &get_arg_bindings() const; + + void bind_from_forward(OpTaskBinding const &fwd); + +private: + std::unordered_map tensor_bindings; + std::unordered_map arg_bindings; + +private: + void insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec); + std::tuple + tie() const; +}; + +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/op_task_invocation.h b/lib/task-spec/include/task-spec/op_task_invocation.h index cce0a4d6a6..88e9e9bf26 100644 --- a/lib/task-spec/include/task-spec/op_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_invocation.h @@ -1,118 +1,11 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H -#include "kernels/accessor.h" -#include "task-spec/concrete_arg.h" -#include "task-spec/is_trainable.dtg.h" -#include "task-spec/op_arg_ref.h" -#include "task-spec/op_arg_spec.dtg.h" +#include "task-spec/op_task_invocation.dtg.h" #include "task-spec/op_task_signature.h" -#include "task-spec/op_tensor_spec.h" -#include "task-spec/profiling.h" -#include "task-spec/runtime_arg_ref.h" -#include "task-spec/slot_grad_id.dtg.h" -#include "task-spec/task_id_t.dtg.h" -#include "task-spec/variadic_tensor_ref.h" -#include -#include -#include -#include namespace FlexFlow { -struct OpTaskBinding { - OpTaskBinding() = default; - - void bind(int, VariadicTensorRef const &); - void bind(slot_id_t, VariadicTensorRef const &); - - void bind(int, OpTensorSpec const &); - void bind(slot_id_t, OpTensorSpec const &); - - void bind_grad(int, OpTensorSpec const &); - void bind_grad(slot_id_t, OpTensorSpec const &); - - template - void bind_device_specific_arg(int name, T const &t) { - this->bind_device_specific_arg(slot_id_t{name}, t); - } - - template - void bind_device_specific_arg(slot_id_t name, T const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_device_specific_arg(int name, OpArgRef const &t) { - this->bind_device_specific_arg(slot_id_t{name}, t); - } - - template - void bind_device_specific_arg(slot_id_t name, OpArgRef const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_arg(int name, T const &t) { - this->bind_arg(slot_id_t{name}, t); - } - - template - void bind_arg(slot_id_t name, T const &t) { - this->insert_arg_spec(name, OpArgSpec{ConcreteArgSpec::create(t)}); - } - - template - void bind_arg(int name, RuntimeArgRef const &t) 
{ - this->bind_arg(slot_id_t{name}, t); - } - - template - void bind_arg(slot_id_t name, RuntimeArgRef const &ref) { - this->insert_arg_spec(name, OpArgSpec{RuntimeArgRefSpec::create(ref)}); - } - - template - void bind_arg(int name, OpArgRef const &t) { - this->bind_arg(slot_id_t{name}, t); - } - - template - void bind_arg(slot_id_t name, OpArgRef const &ref) { - this->insert_arg_spec(name, OpArgSpec{OpArgRefSpec::create(ref)}); - } - bool operator==(OpTaskBinding const &other) const; - bool operator!=(OpTaskBinding const &other) const; - - std::unordered_map const & - get_tensor_bindings() const; - std::unordered_map const &get_arg_bindings() const; - - void bind_from_forward(OpTaskBinding const &fwd); - -private: - std::unordered_map tensor_bindings; - std::unordered_map arg_bindings; - -private: - void insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec); - std::tuple - tie() const; -}; - -struct OpTaskInvocation { -public: - OpTaskInvocation() = delete; - OpTaskInvocation(task_id_t task_id, OpTaskBinding const &binding) - : task_id(task_id), binding(binding) {} - -public: - task_id_t task_id; - OpTaskBinding binding; -}; - -OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); - bool is_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv); diff --git a/lib/task-spec/include/task-spec/op_task_invocation.struct.toml b/lib/task-spec/include/task-spec/op_task_invocation.struct.toml new file mode 100644 index 0000000000..465fa5f1ff --- /dev/null +++ b/lib/task-spec/include/task-spec/op_task_invocation.struct.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "OpTaskInvocation" +features = [] + +includes = [ + "task-spec/op_task_binding.h", + "task-spec/task_id_t.dtg.h", +] + +[[fields]] +name = "task_id" +type = "::FlexFlow::task_id_t" + +[[fields]] +name = "binding" +type = "::FlexFlow::OpTaskBinding" diff --git a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h index 68c7f05d77..3208e9d049 100644 --- a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h @@ -1,31 +1,42 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H +#include "pcg/cg_operator_tensor_shape_signature.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "task-spec/device_specific_device_states.dtg.h" #include "task-spec/op_task_invocation.h" -#include "task-spec/runtime_arg_config.h" +#include "task-spec/runtime_arg_config.dtg.h" #include "task-spec/task_invocation.dtg.h" +#include "task-spec/training_layer_plus_context.dtg.h" +#include "task-spec/training_layer_tensor_group_signature.dtg.h" namespace FlexFlow { -TaskInvocation lower_to_task_invocation( - OpTaskInvocation const &, - layer_guid_t const &, - std::vector const &input_tensors, - std::vector const &input_tensor_shapes, - std::vector const &output_tensors, - std::vector const &weight_tensors, - std::unordered_map const &, - std::optional const &); +TaskInvocation + lower_to_task_invocation(OpTaskInvocation const &op_task_invocation, + TrainingLayerPlusContext const &training_layer, + std::optional const + &device_specific_device_states); + +std::pair lower_tensor_binding( + TrainingLayerTensorGroupSignature const &training_layer_signature, + SlotGradId const &slot_grad_id, + OpTensorSpec const &op_tensor_spec); + +TaskArgSpec lower_to_task_arg_spec( + OpArgSpec const 
&op_arg_spec, + CGOperatorTensorShapeSignature const &op_shape_signature, + layer_guid_t const &layer_guid, + std::optional const + &device_specific_device_states); ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, RuntimeArgConfig const &); ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &, - std::vector const &, + CGOperatorTensorShapeSignature const &, layer_guid_t const &, std::optional const &); diff --git a/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml index 109ddf36af..3a388b8559 100644 --- a/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml +++ b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml @@ -10,7 +10,7 @@ features = [ includes = [ "task-spec/slot_id_t.dtg.h", "task-spec/slot_type.dtg.h", - "task-spec/tensor_role.dtg.h", + "pcg/tensor_role.dtg.h", "task-spec/is_grad.dtg.h", "task-spec/op_slot_options.dtg.h", ] diff --git a/lib/task-spec/include/task-spec/op_tensor_spec.h b/lib/task-spec/include/task-spec/op_tensor_spec.h index c957704a10..6f00a2e38d 100644 --- a/lib/task-spec/include/task-spec/op_tensor_spec.h +++ b/lib/task-spec/include/task-spec/op_tensor_spec.h @@ -1,21 +1,15 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H -#include "task-spec/op_task_signature.h" +#include "task-spec/op_tensor_spec.dtg.h" namespace FlexFlow { -struct OpTensorSpec { - TensorRole role; - OpSlotOptions slot_option; - req idx; -}; -FF_VISITABLE_STRUCT(OpTensorSpec, role, slot_option, idx); - -OpTensorSpec input_tensor(int, OpSlotOptions option = OpSlotOptions::NECESSARY); -OpTensorSpec output_tensor(int, +OpTensorSpec input_tensor(nonnegative_int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY); +OpTensorSpec output_tensor(nonnegative_int idx, OpSlotOptions option = OpSlotOptions::NECESSARY); -OpTensorSpec weight_tensor(int, +OpTensorSpec weight_tensor(nonnegative_int idx, OpSlotOptions option = OpSlotOptions::NECESSARY); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/op_tensor_spec.struct.toml b/lib/task-spec/include/task-spec/op_tensor_spec.struct.toml new file mode 100644 index 0000000000..3e790c7e08 --- /dev/null +++ b/lib/task-spec/include/task-spec/op_tensor_spec.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "OpTensorSpec" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", + "rapidcheck", +] + +includes = [ + "pcg/tensor_role.dtg.h", + "task-spec/op_slot_options.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "role" +type = "::FlexFlow::TensorRole" + +[[fields]] +name = "slot_option" +type = "::FlexFlow::OpSlotOptions" + +[[fields]] +name = "idx" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/task-spec/include/task-spec/ops/combine.h b/lib/task-spec/include/task-spec/ops/combine.h deleted file mode 100644 index ea7b3ed365..0000000000 --- a/lib/task-spec/include/task-spec/ops/combine.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_COMBINE_H -#define _FLEXFLOW_COMBINE_H - -#include "op-attrs/ops/combine_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(CombineAttrs const &); - -TaskImplFunction get_combine_fwd_task_impl(); -TaskImplFunction get_combine_bwd_task_impl(); - -OpTaskSignature get_combine_fwd_signature(); -OpTaskSignature get_combine_bwd_signature(); - -OpTaskInvocation 
forward(CombineAttrs const &); -OpTaskInvocation backward(CombineAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/reduction.h b/lib/task-spec/include/task-spec/ops/reduction.h deleted file mode 100644 index 5ddf292672..0000000000 --- a/lib/task-spec/include/task-spec/ops/reduction.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_REDUCTION_H -#define _FLEXFLOW_REDUCTION_H - -#include "op-attrs/ops/reduction_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReductionAttrs const &); - -TaskImplFunction get_reduction_fwd_task_impl(); -TaskImplFunction get_reduction_bwd_task_impl(); - -OpTaskSignature get_reduction_fwd_signature(); -OpTaskSignature get_reduction_bwd_signature(); - -OpTaskInvocation init(ReductionAttrs const &); -OpTaskInvocation forward(ReductionAttrs const &); -OpTaskInvocation backward(ReductionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/repartition.h b/lib/task-spec/include/task-spec/ops/repartition.h deleted file mode 100644 index dfc42c54e5..0000000000 --- a/lib/task-spec/include/task-spec/ops/repartition.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_PARTITION_H -#define _FLEXFLOW_PARTITION_H - -#include "op-attrs/ops/repartition_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(RepartitionAttrs const &); - -TaskImplFunction get_repartition_init_task_impl(); -TaskImplFunction get_repartition_fwd_task_impl(); -TaskImplFunction get_repartition_bwd_task_impl(); - -OpTaskSignature get_repartition_init_signature(); -OpTaskSignature get_repartition_fwd_signature(); -OpTaskSignature get_repartition_bwd_signature(); - -OpTaskInvocation init(RepartitionAttrs const &); -OpTaskInvocation forward(RepartitionAttrs const &); -OpTaskInvocation backward(RepartitionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/replicate.h b/lib/task-spec/include/task-spec/ops/replicate.h deleted file mode 100644 index 18f6f74b19..0000000000 --- a/lib/task-spec/include/task-spec/ops/replicate.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_REPLICATE_H -#define _FLEXFLOW_REPLICATE_H - -#include "op-attrs/ops/replicate_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReplicateAttrs const &); - -TaskImplFunction get_replicate_fwd_task_impl(); -TaskImplFunction get_replicate_bwd_task_impl(); - -OpTaskSignature get_replicate_fwd_signature(); -OpTaskSignature get_replicate_bwd_signature(); - -OpTaskInvocation forward(ReplicateAttrs const &); -OpTaskInvocation backward(ReplicateAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/reshape.h b/lib/task-spec/include/task-spec/ops/reshape.h index 29d29ae84c..e5bf7170fb 100644 --- a/lib/task-spec/include/task-spec/ops/reshape.h +++ b/lib/task-spec/include/task-spec/ops/reshape.h @@ -9,15 +9,12 @@ namespace FlexFlow { std::vector get_task_ids(ReshapeAttrs const &); -TaskImplFunction get_reshape_init_task_impl(); TaskImplFunction get_reshape_fwd_task_impl(); TaskImplFunction get_reshape_bwd_task_impl(); -OpTaskSignature get_reshape_init_signature(); OpTaskSignature get_reshape_fwd_signature(); OpTaskSignature 
get_reshape_bwd_signature(); -OpTaskInvocation init(ReshapeAttrs const &); OpTaskInvocation forward(ReshapeAttrs const &); OpTaskInvocation backward(ReshapeAttrs const &); diff --git a/lib/task-spec/include/task-spec/ops/topk.h b/lib/task-spec/include/task-spec/ops/topk.h index 33f2dbc5d7..ca1d43c2ee 100644 --- a/lib/task-spec/include/task-spec/ops/topk.h +++ b/lib/task-spec/include/task-spec/ops/topk.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_TOPK_H_ -#define _FLEXFLOW_TOPK_H_ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_TOPK_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_TOPK_H #include "op-attrs/ops/topk_attrs.dtg.h" #include "task-spec/op_task_invocation.h" @@ -9,15 +9,12 @@ namespace FlexFlow { std::vector get_task_ids(TopKAttrs const &); -TaskImplFunction get_topk_init_task_impl(); TaskImplFunction get_topk_fwd_task_impl(); TaskImplFunction get_topk_bwd_task_impl(); -OpTaskSignature get_topk_init_signature(); OpTaskSignature get_topk_fwd_signature(); OpTaskSignature get_topk_bwd_signature(); -OpTaskInvocation init(TopKAttrs const &); OpTaskInvocation forward(TopKAttrs const &); OpTaskInvocation backward(TopKAttrs const &); diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/task-spec/include/task-spec/optimizer.h similarity index 51% rename from lib/local-execution/include/local-execution/optimizer.h rename to lib/task-spec/include/task-spec/optimizer.h index e4a9c78743..5b898d8699 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/task-spec/include/task-spec/optimizer.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_H #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" @@ -13,24 +13,24 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - std::vector const &grad_buffer_tensors); + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - optimizer_tensor_t const &sgd_v); + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + optimizer_tensor_guid_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - optimizer_tensor_t const &adam_v, - optimizer_tensor_t const &adam_m); + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + optimizer_tensor_guid_t const &adam_v, + optimizer_tensor_guid_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml b/lib/task-spec/include/task-spec/optimizer_tensor_guid_t.struct.toml similarity index 78% rename from lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml rename to 
lib/task-spec/include/task-spec/optimizer_tensor_guid_t.struct.toml index 287e548a5b..dc5f98886f 100644 --- a/lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/optimizer_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "lowered_tensor_t" +name = "optimizer_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/optimizer_tensor_source.h b/lib/task-spec/include/task-spec/optimizer_tensor_source.h new file mode 100644 index 0000000000..2f10c5c35b --- /dev/null +++ b/lib/task-spec/include/task-spec/optimizer_tensor_source.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_TENSOR_SOURCE_H + +#include "task-spec/optimizer_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct OptimizerTensorSource { +public: + OptimizerTensorSource(); + + optimizer_tensor_guid_t new_optimizer_tensor(); + + void reset(); + +private: + static int next_available_optimizer_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml b/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml index fe340f4451..4ff411d17b 100644 --- a/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml +++ b/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "ParallelTensorShapeRefType" - features = [ "eq", "ord", @@ -9,6 +8,15 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", + "pcg/tensor_role.dtg.h", +] + +[[fields]] +name = "tensor_role" +type = "::FlexFlow::TensorRole" + [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/task-spec/include/task-spec/per_device_op_state.h b/lib/task-spec/include/task-spec/per_device_op_state.h index 23312d90a5..ae6c93807c 100644 --- a/lib/task-spec/include/task-spec/per_device_op_state.h +++ b/lib/task-spec/include/task-spec/per_device_op_state.h @@ -1,8 +1,10 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H -#define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PER_DEVICE_OP_STATE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PER_DEVICE_OP_STATE_H +#include "task-spec/concrete_arg_spec.h" #include "task-spec/device_specific_device_states.dtg.h" #include "task-spec/per_device_op_state.dtg.h" +#include "utils/type_index.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/per_device_op_state.variant.toml b/lib/task-spec/include/task-spec/per_device_op_state.variant.toml index 0171e3e497..7c340447f9 100644 --- a/lib/task-spec/include/task-spec/per_device_op_state.variant.toml +++ b/lib/task-spec/include/task-spec/per_device_op_state.variant.toml @@ -3,80 +3,70 @@ name = "PerDeviceOpState" features = [] includes = [ - "kernels/attention_kernels.h", - "kernels/batch_norm_kernels.h", - "kernels/conv_2d_kernels.h", - "kernels/dropout_kernels.h", - "kernels/element_binary_kernels.h", - "kernels/element_unary_kernels.h", - "kernels/gather_kernels.h", - "kernels/layer_norm_kernels.h", - "kernels/linear_kernels.h", - "kernels/partition_kernels.h", - "kernels/pool_2d_kernels.h", - "kernels/reduce_kernels.h", - "kernels/reduction_kernels.h", - "kernels/reshape_kernels.h", - "kernels/softmax_kernels.h", - "kernels/topk_kernels.h", + 
"kernels/mha_per_device_state.dtg.h", + "kernels/batch_norm_per_device_state.dtg.h", + "kernels/conv_2d_per_device_state.dtg.h", + "kernels/dropout_per_device_state.dtg.h", + "kernels/element_binary_per_device_state.dtg.h", + "kernels/element_unary_per_device_state.dtg.h", + "kernels/gather_per_device_state.dtg.h", + "kernels/layer_norm_per_device_state.dtg.h", + "kernels/linear_per_device_state.dtg.h", + "kernels/partition_per_device_state.dtg.h", + "kernels/pool_2d_per_device_state.dtg.h", + "kernels/reduce_per_device_state.dtg.h", + "kernels/softmax_per_device_state.dtg.h", + "", ] [[values]] -type = "::FlexFlow::MHAPerDeviceState" +type = "std::optional<::FlexFlow::MHAPerDeviceState>" key = "mha_per_device_state" [[values]] -type = "::FlexFlow::BatchNormPerDeviceState" +type = "std::optional<::FlexFlow::BatchNormPerDeviceState>" key = "batch_norm_per_device_state" [[values]] -type = "::FlexFlow::Conv2DPerDeviceState" +type = "std::optional<::FlexFlow::Conv2DPerDeviceState>" key = "conv2d_per_device_state" [[values]] -type = "::FlexFlow::DropoutPerDeviceState" +type = "std::optional<::FlexFlow::DropoutPerDeviceState>" key = "dropout_per_device_state" [[values]] -type = "::FlexFlow::ElementBinaryPerDeviceState" +type = "std::optional<::FlexFlow::ElementBinaryPerDeviceState>" key = "element_binary_per_device_state" [[values]] -type = "::FlexFlow::ElementUnaryPerDeviceState" +type = "std::optional<::FlexFlow::ElementUnaryPerDeviceState>" key = "element_unary_per_device_state" [[values]] -type = "::FlexFlow::GatherPerDeviceState" +type = "std::optional<::FlexFlow::GatherPerDeviceState>" key = "gather_per_device_state" [[values]] -type = "::FlexFlow::LayerNormPerDeviceState" +type = "std::optional<::FlexFlow::LayerNormPerDeviceState>" key = "layer_norm_per_device_state" [[values]] -type = "::FlexFlow::LinearPerDeviceState" +type = "std::optional<::FlexFlow::LinearPerDeviceState>" key = "linear_per_device_state" [[values]] -type = "::FlexFlow::Pool2DPerDeviceState" +type = "std::optional<::FlexFlow::Pool2DPerDeviceState>" key = "pool_2d_per_device_state" [[values]] -type = "::FlexFlow::ReducePerDeviceState" +type = "std::optional<::FlexFlow::ReducePerDeviceState>" key = "reduce_per_device_state" [[values]] -type = "::FlexFlow::RepartitionPerDeviceState" +type = "std::optional<::FlexFlow::RepartitionPerDeviceState>" key = "repartition_per_device_state" [[values]] -type = "::FlexFlow::ReshapePerDeviceState" -key = "reshape_per_device_state" - -[[values]] -type = "::FlexFlow::SoftmaxPerDeviceState" +type = "std::optional<::FlexFlow::SoftmaxPerDeviceState>" key = "softmax_per_device_state" - -[[values]] -type = "::FlexFlow::TopKPerDeviceState" -key = "topk_per_device_state" diff --git a/lib/task-spec/include/task-spec/profiling.h b/lib/task-spec/include/task-spec/profiling.h index bd50801fc4..91774f69ef 100644 --- a/lib/task-spec/include/task-spec/profiling.h +++ b/lib/task-spec/include/task-spec/profiling.h @@ -9,10 +9,13 @@ namespace FlexFlow { enum class EnableProfiling { YES, NO }; template -std::optional - profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { - std::optional elapsed = - profiling_wrapper(f, profiling, std::forward(ts)...); +std::optional profile(F const &f, + ProfilingSettings profiling, + DeviceType device_type, + Str s, + Ts &&...ts) { + std::optional elapsed = profiling_wrapper( + f, profiling, device_type, std::forward(ts)...); if (elapsed.has_value()) { spdlog::debug(s, elapsed.value()); } diff --git a/lib/task-spec/include/task-spec/runtime_arg_config.h 
b/lib/task-spec/include/task-spec/runtime_arg_config.h index f4320bc40b..5358caf331 100644 --- a/lib/task-spec/include/task-spec/runtime_arg_config.h +++ b/lib/task-spec/include/task-spec/runtime_arg_config.h @@ -1,18 +1,17 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_RUNTIME_ARG_CONFIG_H -#define _FLEXFLOW_LOCAL_EXECUTION_RUNTIME_ARG_CONFIG_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_RUNTIME_ARG_CONFIG_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_RUNTIME_ARG_CONFIG_H -#include "kernels/ff_handle.h" -#include "task-spec/device_specific.h" -#include "task-spec/profiling.h" +#include "task-spec/runtime_arg_config.dtg.h" namespace FlexFlow { -struct RuntimeArgConfig { -public: - DeviceSpecific ff_handle; - EnableProfiling enable_profiling; - ProfilingSettings profiling_settings; -}; +RuntimeArgConfig + cpu_make_runtime_arg_config(EnableProfiling enable_profiling, + ProfilingSettings profiling_settings); +RuntimeArgConfig + gpu_make_runtime_arg_config(PerDeviceFFHandle const &ff_handle, + EnableProfiling enable_profiling, + ProfilingSettings profiling_settings); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/runtime_arg_config.struct.toml b/lib/task-spec/include/task-spec/runtime_arg_config.struct.toml new file mode 100644 index 0000000000..9d77616306 --- /dev/null +++ b/lib/task-spec/include/task-spec/runtime_arg_config.struct.toml @@ -0,0 +1,25 @@ +namespace = "FlexFlow" +name = "RuntimeArgConfig" +features = [] + +includes = [ + "kernels/device_handle_t.dtg.h", + "task-spec/device_specific.h", + "task-spec/profiling.h", +] + +[[fields]] +name = "ff_handle" +type = "::FlexFlow::DeviceSpecific<::FlexFlow::device_handle_t>" + +[[fields]] +name = "enable_profiling" +type = "::FlexFlow::EnableProfiling" + +[[fields]] +name = "profiling_settings" +type = "::FlexFlow::ProfilingSettings" + +[[fields]] +name = "kernel_device_type" +type = "::FlexFlow::DeviceType" diff --git a/lib/task-spec/include/task-spec/runtime_arg_ref.h b/lib/task-spec/include/task-spec/runtime_arg_ref.h index 33fccb0106..532482f89e 100644 --- a/lib/task-spec/include/task-spec/runtime_arg_ref.h +++ b/lib/task-spec/include/task-spec/runtime_arg_ref.h @@ -1,31 +1,25 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H +#include "kernels/device_handle_t.dtg.h" +#include "kernels/profiling_settings.dtg.h" +#include "pcg/device_type.dtg.h" #include "task-spec/arg_ref.h" #include "task-spec/config.h" #include "task-spec/device_specific.h" -#include "task-spec/profiling.h" -#include "utils/fmt.h" -#include "utils/type_index.h" +#include "task-spec/runtime_arg_ref_type.dtg.h" namespace FlexFlow { -enum class RuntimeArgRefType { - FF_HANDLE, - PROFILING_SETTINGS, - FF_ITERATION_CONFIG -}; - -std::string to_string(RuntimeArgRefType const &); - template using RuntimeArgRef = ArgRef; using RuntimeArgRefSpec = ArgRefSpec; RuntimeArgRef profiling_settings(); -RuntimeArgRef> ff_handle(); +RuntimeArgRef> ff_handle(); RuntimeArgRef iteration_config(); +RuntimeArgRef kernel_device_type(); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml b/lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml new file mode 100644 index 0000000000..e33eeebc56 --- /dev/null +++ b/lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "RuntimeArgRefType" +features = [ + "fmt", +] + +[[values]] +name = "FF_HANDLE" + +[[values]] +name = "PROFILING_SETTINGS" 
+ +[[values]] +name = "FF_ITERATION_CONFIG" + +[[values]] +name = "KERNEL_DEVICE_TYPE" diff --git a/lib/task-spec/include/task-spec/task_arg_spec.variant.toml b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml index 0f81f93405..4829a50ff6 100644 --- a/lib/task-spec/include/task-spec/task_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "task-spec/concrete_arg.h", + "task-spec/concrete_arg_spec.h", "task-spec/runtime_arg_ref.h" ] diff --git a/lib/task-spec/include/task-spec/task_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor.h index c1c42e09a3..a6d71b6b70 100644 --- a/lib/task-spec/include/task-spec/task_argument_accessor.h +++ b/lib/task-spec/include/task-spec/task_argument_accessor.h @@ -11,19 +11,7 @@ struct TaskArgumentAccessor { // arguments template T const &get_argument(slot_id_t slot) const { - if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { - PerDeviceOpState device_states = - this->ptr->get_concrete_arg(slot).get(); - if (device_states.has()) { - return device_states.get(); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); - } - } else { - return this->ptr->get_concrete_arg(slot).get(); - } + return this->ptr->get_concrete_arg(slot).get(); } template diff --git a/lib/task-spec/include/task-spec/task_binding.h b/lib/task-spec/include/task-spec/task_binding.h index a945fec1d7..4cc286e104 100644 --- a/lib/task-spec/include/task-spec/task_binding.h +++ b/lib/task-spec/include/task-spec/task_binding.h @@ -1,32 +1,36 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H -#include "task-spec/loss_tensor_t.dtg.h" -#include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/optimizer_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" +#include "task-spec/optimizer_tensor_guid_t.dtg.h" #include "task-spec/slot_id_t.dtg.h" -#include "task-spec/slot_tensor_type_id.dtg.h" #include "task-spec/task_arg_spec.dtg.h" #include "task-spec/task_id_t.dtg.h" #include "task-spec/task_signature.dtg.h" -#include "task-spec/tensor_type_t.dtg.h" +#include "task-spec/tensor_sub_slot_id_t.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" namespace FlexFlow { struct TaskBinding { - TaskBinding() = default; + TaskBinding(); - void bind(int, tensor_guid_t const &); - void bind(slot_id_t, tensor_guid_t const &); + explicit TaskBinding( + std::unordered_map const + &tensor_bindings, + std::unordered_map const &arg_bindings); - void bind_grad(int, gradient_tensor_t const &); - void bind_grad(slot_id_t, gradient_tensor_t const &); + void bind(int, forward_tensor_guid_t const &); + void bind(slot_id_t, forward_tensor_guid_t const &); - void bind_optimizer(int, optimizer_tensor_t const &); - void bind_optimizer(slot_id_t, optimizer_tensor_t const &); + void bind_grad(int, gradient_tensor_guid_t const &); + void bind_grad(slot_id_t, gradient_tensor_guid_t const &); - void bind_loss(int, loss_tensor_t const &); - void bind_loss(slot_id_t, loss_tensor_t const &); + void bind_optimizer(int, optimizer_tensor_guid_t const &); + void bind_optimizer(slot_id_t, optimizer_tensor_guid_t const &); + + void bind_loss(int, loss_tensor_guid_t const &); + void bind_loss(slot_id_t, loss_tensor_guid_t const &); template void bind_arg(int name, T const &t) { @@ -51,18 +55,21 @@ struct TaskBinding { bool operator==(TaskBinding const 
&other) const; bool operator!=(TaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); private: - std::unordered_map tensor_bindings; + std::unordered_map + tensor_bindings; std::unordered_map arg_bindings; private: std::tuple tie() const; + + friend ::std::hash; }; std::string format_as(TaskBinding const &x); diff --git a/lib/task-spec/include/task-spec/task_id_t.enum.toml b/lib/task-spec/include/task-spec/task_id_t.enum.toml index b0c82b5d26..2e8f0a0046 100644 --- a/lib/task-spec/include/task-spec/task_id_t.enum.toml +++ b/lib/task-spec/include/task-spec/task_id_t.enum.toml @@ -106,9 +106,6 @@ name = "BATCHNORM_FWD_TASK_ID" [[values]] name = "BATCHNORM_BWD_TASK_ID" -[[values]] -name = "BATCHMATMUL_INIT_TASK_ID" - [[values]] name = "BATCHMATMUL_FWD_TASK_ID" @@ -178,9 +175,6 @@ name = "REDUCE_FWD_TASK_ID" [[values]] name = "REDUCE_BWD_TASK_ID" -[[values]] -name = "RESHAPE_INIT_TASK_ID" - [[values]] name = "RESHAPE_FWD_TASK_ID" @@ -196,9 +190,6 @@ name = "REVERSE_FWD_TASK_ID" [[values]] name = "REVERSE_BWD_TASK_ID" -[[values]] -name = "TOPK_INIT_TASK_ID" - [[values]] name = "TOPK_FWD_TASK_ID" diff --git a/lib/task-spec/include/task-spec/task_signature_impl.h b/lib/task-spec/include/task-spec/task_signature_impl.h index fcf9b346cf..a781e53485 100644 --- a/lib/task-spec/include/task-spec/task_signature_impl.h +++ b/lib/task-spec/include/task-spec/task_signature_impl.h @@ -8,12 +8,14 @@ namespace FlexFlow { -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &); +TaskSignatureAndImpl get_task_signature_and_impl_for_task_id(task_id_t const &); std::vector get_task_ids(ComputationGraphOpAttrs const &); -OpTaskInvocation init(ComputationGraphOpAttrs const &); -OpTaskInvocation forward(ComputationGraphOpAttrs const &); -OpTaskInvocation backward(ComputationGraphOpAttrs const &); +OpTaskInvocation get_init_op_task_invocation(ComputationGraphOpAttrs const &); +OpTaskInvocation + get_forward_op_task_invocation(ComputationGraphOpAttrs const &); +OpTaskInvocation + get_backward_op_task_invocation(ComputationGraphOpAttrs const &); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml b/lib/task-spec/include/task-spec/tensor_sub_slot_id_t.struct.toml similarity index 90% rename from lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml rename to lib/task-spec/include/task-spec/tensor_sub_slot_id_t.struct.toml index ab5b981637..a830725a27 100644 --- a/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml +++ b/lib/task-spec/include/task-spec/tensor_sub_slot_id_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "SlotTensorTypeId" +name = "tensor_sub_slot_id_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/tensor_type_t.variant.toml b/lib/task-spec/include/task-spec/tensor_type_t.variant.toml deleted file mode 100644 index b93ed91081..0000000000 --- a/lib/task-spec/include/task-spec/tensor_type_t.variant.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "TensorTypeVariant" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "pcg/tensor_guid_t.dtg.h", - "task-spec/optimizer_tensor_t.dtg.h", - "task-spec/gradient_tensor_t.dtg.h", - "task-spec/loss_tensor_t.dtg.h" -] - -[[values]] -type = "::FlexFlow::tensor_guid_t" -key = "tensor_guid" - -[[values]] -type = "::FlexFlow::gradient_tensor_t" 
-key = "gradient_tensor" - -[[values]] -type = "::FlexFlow::optimizer_tensor_t" -key = "optimizer_tensor" - -[[values]] -type = "::FlexFlow::loss_tensor_t" -key = "loss_tensor" diff --git a/lib/task-spec/include/task-spec/training_computation_graph.h b/lib/task-spec/include/task-spec/training_computation_graph.h new file mode 100644 index 0000000000..1cda57a49e --- /dev/null +++ b/lib/task-spec/include/task-spec/training_computation_graph.h @@ -0,0 +1,68 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_COMPUTATION_GRAPH_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_COMPUTATION_GRAPH_H + +#include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/training_computation_graph.dtg.h" +#include "task-spec/training_layer_plus_context.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +TrainingComputationGraph generate_training_computation_graph( + ComputationGraph const &computation_graph, + OptimizerAttrs const &optimizer_attrs, + tensor_guid_t const &logit_tensor, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, + LossTensorSource &loss_tensor_source); + +TrainingTensorGroup + get_training_tensor_group_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_tensor_guid( + TrainingComputationGraph const &, tensor_guid_t); + +forward_tensor_guid_t + get_forward_tensor_guid_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); +gradient_tensor_guid_t + get_gradient_tensor_guid_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); +std::vector + get_optimizer_tensor_guids_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); + +tensor_guid_t + get_tensor_guid_for_forward_tensor_guid(TrainingComputationGraph const &, + forward_tensor_guid_t); +tensor_guid_t + get_tensor_guid_for_gradient_tensor_guid(TrainingComputationGraph const &, + gradient_tensor_guid_t); +tensor_guid_t + get_tensor_guid_for_optimizer_tensor_guid(TrainingComputationGraph const &, + optimizer_tensor_guid_t); + +tensor_guid_t + get_tensor_guid_for_training_tensor_guid(TrainingComputationGraph const &, + training_tensor_guid_t); + +std::unordered_set + get_all_training_tensors_in_training_computation_graph( + TrainingComputationGraph const &); + +TrainingLayerPlusContext + get_training_layer_plus_context(TrainingComputationGraph const &, + layer_guid_t); + +std::unordered_map + get_all_training_tensor_shapes(TrainingComputationGraph const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_computation_graph.struct.toml b/lib/task-spec/include/task-spec/training_computation_graph.struct.toml new file mode 100644 index 0000000000..1e294df7eb --- /dev/null +++ b/lib/task-spec/include/task-spec/training_computation_graph.struct.toml @@ -0,0 +1,27 @@ +namespace = "FlexFlow" +name = "TrainingComputationGraph" +features = [] + +includes = [ + "pcg/computation_graph.h", + "", + "pcg/tensor_guid_t.dtg.h", + "task-spec/training_tensor_group.dtg.h", + "task-spec/loss_tensor_guid_t.dtg.h", +] + +[[fields]] +name = "computation_graph" +type = "::FlexFlow::ComputationGraph" + +[[fields]] +name = "training_tensor_group_for_tensor" +type = 
"std::unordered_map" + +[[fields]] +name = "logit_tensor" +type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "label_tensor" +type = "::FlexFlow::loss_tensor_guid_t" diff --git a/lib/task-spec/include/task-spec/training_layer_plus_context.h b/lib/task-spec/include/task-spec/training_layer_plus_context.h new file mode 100644 index 0000000000..4ce1ddf1a9 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_plus_context.h @@ -0,0 +1,50 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_PLUS_CONTEXT_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_PLUS_CONTEXT_H + +#include "pcg/cg_operator_tensor_shape_signature.dtg.h" +#include "pcg/tensor_role.dtg.h" +#include "task-spec/training_layer_plus_context.dtg.h" +#include "task-spec/training_layer_tensor_group_signature.dtg.h" + +namespace FlexFlow { + +std::vector + get_training_tensor_groups_with_attrs_for_role( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role); + +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_role_and_index( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role, + nonnegative_int index); + +std::vector + get_input_tensors(TrainingLayerPlusContext const &); +std::vector + get_input_grad_tensors(TrainingLayerPlusContext const &); +std::vector + get_input_tensor_shapes(TrainingLayerPlusContext const &); + +std::vector + get_weight_tensors(TrainingLayerPlusContext const &); +std::vector + get_weight_grad_tensors(TrainingLayerPlusContext const &); +std::vector + get_weight_tensor_shapes(TrainingLayerPlusContext const &); + +std::vector + get_output_tensors(TrainingLayerPlusContext const &); +std::vector + get_output_grad_tensors(TrainingLayerPlusContext const &); +std::vector + get_output_tensor_shapes(TrainingLayerPlusContext const &); + +TrainingLayerTensorGroupSignature + get_tensor_group_signature(TrainingLayerPlusContext const &); +CGOperatorTensorShapeSignature + get_cg_op_shape_signature(TrainingLayerPlusContext const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml b/lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml new file mode 100644 index 0000000000..9090059351 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml @@ -0,0 +1,29 @@ +namespace = "FlexFlow" +name = "TrainingLayerPlusContext" +features = [] + +includes = [ + "pcg/layer_guid_t.dtg.h", + "pcg/layer_attrs.dtg.h", + "task-spec/training_tensor_group_with_attrs.dtg.h", +] + +[[fields]] +name = "layer_guid" +type = "::FlexFlow::layer_guid_t" + +[[fields]] +name = "layer_attrs" +type = "::FlexFlow::LayerAttrs" + +[[fields]] +name = "input_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroupWithAttrs>" + +[[fields]] +name = "weight_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroupWithAttrs>" + +[[fields]] +name = "output_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroupWithAttrs>" diff --git a/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h new file mode 100644 index 0000000000..62b11e3af3 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_TENSOR_GROUP_SIGNATURE_H +#define 
_FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_TENSOR_GROUP_SIGNATURE_H + +#include "pcg/tensor_role.dtg.h" +#include "task-spec/training_layer_tensor_group_signature.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +std::vector get_training_tensor_groups_for_role( + TrainingLayerTensorGroupSignature const &signature, TensorRole tensor_role); + +TrainingTensorGroup get_training_tensor_group_for_role_and_index( + TrainingLayerTensorGroupSignature const &signature, + TensorRole tensor_role, + nonnegative_int index); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml new file mode 100644 index 0000000000..d9859559a1 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "TrainingLayerTensorGroupSignature" +features = [] + +includes = [ + "task-spec/training_tensor_group.dtg.h", +] + +[[fields]] +name = "input_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroup>" + +[[fields]] +name = "weight_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroup>" + +[[fields]] +name = "output_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroup>" diff --git a/lib/task-spec/include/task-spec/training_tensor_group.h b/lib/task-spec/include/task-spec/training_tensor_group.h new file mode 100644 index 0000000000..40269ceab0 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TRAINING_TENSOR_GROUP_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TRAINING_TENSOR_GROUP_H + +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/tensor_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/training_tensor_group.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +TrainingTensorGroup make_training_tensor_group_for_tensor_guid_t( + tensor_guid_t tensor_guid, + TensorAttrs const &tensor_attrs, + OptimizerAttrs const &optimizer_attrs, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source); + +std::unordered_set + get_all_training_tensors_in_tensor_group(TrainingTensorGroup const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_tensor_group.struct.toml b/lib/task-spec/include/task-spec/training_tensor_group.struct.toml new file mode 100644 index 0000000000..eadaac08ad --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group.struct.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "TrainingTensorGroup" +features = [ + "eq", + "ord", + "fmt", + "hash", +] + +includes = [ + "task-spec/forward_tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_guid_t.dtg.h", + "task-spec/optimizer_tensor_guid_t.dtg.h", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "forward_tensor" +type = "::FlexFlow::forward_tensor_guid_t" + +[[fields]] +name = "gradient_tensor" +type = "::FlexFlow::gradient_tensor_guid_t" + +[[fields]] +name = "optimizer_tensors" +type = 
"std::vector<::FlexFlow::optimizer_tensor_guid_t>" diff --git a/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h new file mode 100644 index 0000000000..2560228b1c --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_TENSOR_GROUP_WITH_ATTRS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_TENSOR_GROUP_WITH_ATTRS_H + +#include "task-spec/training_tensor_group.dtg.h" +#include "task-spec/training_tensor_group_with_attrs.dtg.h" + +namespace FlexFlow { + +TrainingTensorGroupWithAttrs + make_training_tensor_group_with_attrs_from_group_and_attrs( + TrainingTensorGroup const &group, TensorAttrs const &attrs); + +TrainingTensorGroup + tensor_group_without_attrs(TrainingTensorGroupWithAttrs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml new file mode 100644 index 0000000000..5816214fb3 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml @@ -0,0 +1,37 @@ +namespace = "FlexFlow" +name = "TrainingTensorGroupWithAttrs" +features = [ + "eq", + "ord", + "fmt", + "hash", +] + +includes = [ + "pcg/tensor_attrs.dtg.h", + "task-spec/forward_tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_guid_t.dtg.h", + "task-spec/optimizer_tensor_guid_t.dtg.h", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "tensor_attrs" +type = "::FlexFlow::TensorAttrs" + +[[fields]] +name = "forward_tensor" +type = "::FlexFlow::forward_tensor_guid_t" + +[[fields]] +name = "gradient_tensor" +type = "::FlexFlow::gradient_tensor_guid_t" + +[[fields]] +name = "optimizer_tensors" +type = "std::vector<::FlexFlow::optimizer_tensor_guid_t>" + diff --git a/lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml b/lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml new file mode 100644 index 0000000000..d2520dacbf --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "training_tensor_guid_t" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "task-spec/forward_tensor_guid_t.dtg.h", + "task-spec/optimizer_tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_guid_t.dtg.h", + "task-spec/loss_tensor_guid_t.dtg.h" +] + +[[values]] +type = "::FlexFlow::forward_tensor_guid_t" +key = "forward_tensor" + +[[values]] +type = "::FlexFlow::gradient_tensor_guid_t" +key = "gradient_tensor" + +[[values]] +type = "::FlexFlow::optimizer_tensor_guid_t" +key = "optimizer_tensor" + +[[values]] +type = "::FlexFlow::loss_tensor_guid_t" +key = "loss_tensor" diff --git a/lib/task-spec/src/task-spec/concrete_arg.cc b/lib/task-spec/src/task-spec/concrete_arg_spec.cc similarity index 94% rename from lib/task-spec/src/task-spec/concrete_arg.cc rename to lib/task-spec/src/task-spec/concrete_arg_spec.cc index b67b74b19a..05fd703df1 100644 --- a/lib/task-spec/src/task-spec/concrete_arg.cc +++ b/lib/task-spec/src/task-spec/concrete_arg_spec.cc @@ -1,4 +1,4 @@ -#include "task-spec/concrete_arg.h" +#include "task-spec/concrete_arg_spec.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/forward_tensor_source.cc b/lib/task-spec/src/task-spec/forward_tensor_source.cc new file 
mode 100644 index 0000000000..3d82452377 --- /dev/null +++ b/lib/task-spec/src/task-spec/forward_tensor_source.cc @@ -0,0 +1,18 @@ +#include "task-spec/forward_tensor_source.h" + +namespace FlexFlow { + +int ForwardTensorSource::next_available_forward_tensor_id = 0; + +ForwardTensorSource::ForwardTensorSource() {} + +forward_tensor_guid_t ForwardTensorSource::new_forward_tensor() { + return forward_tensor_guid_t{ + ForwardTensorSource::next_available_forward_tensor_id++}; +} + +void ForwardTensorSource::reset() { + ForwardTensorSource::next_available_forward_tensor_id = 0; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/gradient_tensor_source.cc b/lib/task-spec/src/task-spec/gradient_tensor_source.cc similarity index 55% rename from lib/local-execution/src/gradient_tensor_source.cc rename to lib/task-spec/src/task-spec/gradient_tensor_source.cc index 7dcb947e89..8bc5034634 100644 --- a/lib/local-execution/src/gradient_tensor_source.cc +++ b/lib/task-spec/src/task-spec/gradient_tensor_source.cc @@ -1,13 +1,13 @@ -#include "local-execution/gradient_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" namespace FlexFlow { -size_t GradientTensorSource::next_available_gradient_tensor_id = 0; +int GradientTensorSource::next_available_gradient_tensor_id = 0; GradientTensorSource::GradientTensorSource() {} -gradient_tensor_t GradientTensorSource::new_gradient_tensor() { - return gradient_tensor_t{ +gradient_tensor_guid_t GradientTensorSource::new_gradient_tensor() { + return gradient_tensor_guid_t{ GradientTensorSource::next_available_gradient_tensor_id++}; } diff --git a/lib/local-execution/src/loss_functions.cc b/lib/task-spec/src/task-spec/loss_functions.cc similarity index 66% rename from lib/local-execution/src/loss_functions.cc rename to lib/task-spec/src/task-spec/loss_functions.cc index 4d0b32fd48..698ca941d3 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/task-spec/src/task-spec/loss_functions.cc @@ -15,14 +15,13 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" -#include "local-execution/loss_functions.h" -#include "kernels/format_accessor_contents.h" +#include "task-spec/loss_functions.h" #include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -enum Slots { LOGIT, LABEL, LOGIT_GRAD, ATTRS, PROFILING }; +enum Slots { LOGIT, LABEL, LOGIT_GRAD, ATTRS, PROFILING, KERNEL_DEVICE_TYPE }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); @@ -32,13 +31,14 @@ TaskSignature get_loss_bwd_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); + add_arg_slot(sig, KERNEL_DEVICE_TYPE); return sig; } TaskInvocation backward(LossAttrs const &attrs, - tensor_guid_t logit, - gradient_tensor_t logit_grad, - loss_tensor_t label) { + forward_tensor_guid_t logit, + gradient_tensor_guid_t logit_grad, + loss_tensor_guid_t label) { TaskBinding b; b.bind(LOGIT, logit); b.bind_loss(LABEL, label); @@ -46,6 +46,7 @@ TaskInvocation backward(LossAttrs const &attrs, b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); return TaskInvocation{task_id_t::LOSS_BWD_TASK_ID, b}; } @@ -53,53 +54,63 @@ TaskInvocation backward(LossAttrs const &attrs, static void backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto profiling = acc.get_argument(PROFILING); + auto kernel_device_type = acc.get_argument(KERNEL_DEVICE_TYPE); auto 
logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); + int batch_size = + dim_at_idx(logit.shape.dims, legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { - ASSERT(logit.shape.num_elements() == label.shape.num_elements()); - scale_factor = 2.0f / logit.shape.num_elements().int_from_positive_int(); + ASSERT(get_num_elements(logit.shape.dims) == + get_num_elements(label.shape.dims)); + scale_factor = + 2.0f / get_num_elements(logit.shape.dims).int_from_positive_int(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); - size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + size_t ndim = get_num_dims(logit.shape.dims).unwrap_nonnegative(); + int num_classes = + dim_at_idx(logit.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { - k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + k = dim_at_idx(logit.shape.dims, legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int() / - label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + dim_at_idx(label.shape.dims, legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int(); // TODO FIXME something seems wrong // here, isn't the numerator guaranteed // to be 1? // <--- this is not the case because of // the potential parallel dim } - ASSERT(label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); - ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + ASSERT(slice_tensor_dims( + label.shape.dims, relative_ff_dim_t{0}, relative_ff_dim_t{-2}) == + slice_tensor_dims( + logit.shape.dims, relative_ff_dim_t{0}, relative_ff_dim_t{-2})); + ASSERT(k * dim_at_idx(label.shape.dims, + legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int() == - logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + dim_at_idx(logit.shape.dims, legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int()); - ASSERT(label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); + ASSERT(dim_at_idx(label.shape.dims, legion_dim_t(0_n)) + .int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, + kernel_device_type, "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_num_elements(logit.shape).int_from_positive_int(), - get_num_elements(logit_grad.shape).int_from_positive_int(), + get_num_elements(logit.shape.dims).int_from_positive_int(), + get_num_elements(logit_grad.shape.dims).int_from_positive_int(), batch_size, num_classes, k, @@ -108,46 +119,41 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { ASSERT(logit.shape == label.shape); ASSERT(logit_grad.shape == logit.shape); int num_channels = - logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + dim_at_idx(logit.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { - 
size_t logit_volume = get_num_elements(logit.shape).int_from_positive_int();
-      size_t logit_grad_volume =
-          get_num_elements(logit_grad.shape).int_from_positive_int();
-
       profile(categorical_crossentropy_loss_backward_kernel,
               profiling,
+              kernel_device_type,
               "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n",
-              get_float_ptr(logit_grad),
-              get_float_ptr(logit),
-              get_float_ptr(label),
-              logit_volume,
-              logit_grad_volume,
+              logit_grad,
+              logit,
+              label,
               scale_factor);
-
       break;
     }
     case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: {
       profile(mean_squared_error_avg_loss_backward_kernel,
               profiling,
+              kernel_device_type,
               "[MeanSquaredErrorAvgLoss] backward_time = %.2lfms\n",
               get_float_ptr(logit_grad),
               get_float_ptr(logit),
               get_float_ptr(label),
-              get_num_elements(logit.shape).int_from_positive_int(),
-              get_num_elements(logit_grad.shape).int_from_positive_int(),
+              get_num_elements(logit.shape.dims).int_from_positive_int(),
+              get_num_elements(logit_grad.shape.dims).int_from_positive_int(),
               scale_factor);
       break;
     }
     case LossFunction::IDENTITY: {
       profile(identity_loss_backward_kernel,
               profiling,
+              kernel_device_type,
               "[IdentityLoss] backward_time = %.2lfms\n",
               get_float_ptr(logit_grad),
               get_float_ptr(logit),
-              get_num_elements(logit.shape).int_from_positive_int(),
-              get_num_elements(logit_grad.shape).int_from_positive_int(),
+              get_num_elements(logit.shape.dims).int_from_positive_int(),
+              get_num_elements(logit_grad.shape.dims).int_from_positive_int(),
               scale_factor);
       break;
     }
diff --git a/lib/task-spec/src/task-spec/loss_tensor_source.cc b/lib/task-spec/src/task-spec/loss_tensor_source.cc
new file mode 100644
index 0000000000..13b97fd604
--- /dev/null
+++ b/lib/task-spec/src/task-spec/loss_tensor_source.cc
@@ -0,0 +1,13 @@
+#include "task-spec/loss_tensor_source.h"
+
+namespace FlexFlow {
+
+nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n;
+
+LossTensorSource::LossTensorSource() {}
+
+loss_tensor_guid_t LossTensorSource::new_loss_tensor() {
+  return loss_tensor_guid_t{LossTensorSource::next_available_loss_tensor_id++};
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/op_arg_ref.cc b/lib/task-spec/src/task-spec/op_arg_ref.cc
index a427117982..29c895f1c8 100644
--- a/lib/task-spec/src/task-spec/op_arg_ref.cc
+++ b/lib/task-spec/src/task-spec/op_arg_ref.cc
@@ -2,8 +2,31 @@
 
 namespace FlexFlow {
 
-OpArgRef input_parallel_tensor_shape(int idx) {
-  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{idx}};
+OpArgRef input_parallel_tensor_shape(nonnegative_int idx) {
+  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{
+      /*tensor_role=*/TensorRole::INPUT,
+      /*idx=*/idx,
+  }};
+  ArgRef arg_ref = {arg_ref_type};
+  return arg_ref;
+}
+
+OpArgRef
+    weight_parallel_tensor_shape(nonnegative_int idx) {
+  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{
+      /*tensor_role=*/TensorRole::WEIGHT,
+      /*idx=*/idx,
+  }};
+  ArgRef arg_ref = {arg_ref_type};
+  return arg_ref;
+}
+
+OpArgRef
+    output_parallel_tensor_shape(nonnegative_int idx) {
+  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{
+      /*tensor_role=*/TensorRole::OUTPUT,
+      /*idx=*/idx,
+  }};
   ArgRef arg_ref = {arg_ref_type};
   return arg_ref;
 }
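As a quick check on the scale factors computed in the loss backward task above (an illustrative note, not part of the patch): for a logit tensor of shape [batch_size, num_classes], every loss except MEAN_SQUARED_ERROR_AVG_REDUCE scales the gradient by 1/batch_size, while MSE-avg scales by 2/num_elements:

    // batch_size = 64, num_classes = 10  =>  logit shape [64, 10]
    float scale_default = 1.0f / 64;         // 1 / batch_size   = 0.015625
    float scale_mse_avg = 2.0f / (64 * 10);  // 2 / num_elements = 0.003125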
diff --git a/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc b/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc
index 515d1dc1dc..b33edc9a76 100644
--- a/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc
+++ b/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc
@@ -1,105 +1,161 @@
 #include "task-spec/op_task_to_task_invocation.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/cg_operator_tensor_shape_signature.h"
 #include "pcg/computation_graph.h"
+#include "task-spec/slot_grad_id.dtg.h"
+#include "task-spec/training_layer_plus_context.h"
+#include "task-spec/training_layer_tensor_group_signature.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
+#include "utils/overload.h"
 
 namespace FlexFlow {
 
-TaskInvocation lower_to_task_invocation(
-    OpTaskInvocation const &op_task_invocation,
-    layer_guid_t const &layer_guid,
-    std::vector const &input_tensors,
-    std::vector const &input_tensor_shapes,
-    std::vector const &output_tensors,
-    std::vector const &weight_tensors,
-    std::unordered_map const
-        &tensor_gradient_mapping,
-    std::optional const &device_states) {
-  TaskBinding binding;
+TaskInvocation
+    lower_to_task_invocation(OpTaskInvocation const &op_task_invocation,
+                             TrainingLayerPlusContext const &training_layer,
+                             std::optional<DeviceSpecificDeviceStates> const
+                                 &device_specific_device_states) {
 
-  for (auto const &tensor_binding :
-       op_task_invocation.binding.get_tensor_bindings()) {
-    tensor_guid_t tensor_to_bind = [&] {
-      OpTensorSpec tensor_binding_spec = tensor_binding.second;
-      switch (tensor_binding_spec.role) {
-        case TensorRole::INPUT:
-          return input_tensors.at(tensor_binding_spec.idx);
-        case TensorRole::OUTPUT:
-          return output_tensors.at(tensor_binding_spec.idx);
-        case TensorRole::WEIGHT:
-          return weight_tensors.at(tensor_binding_spec.idx);
-        default:
-          throw mk_runtime_error(
-              fmt::format("Invalid tensor role {}", tensor_binding_spec.role));
-      }
-    }();
+  std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t>
+      tensor_bindings =
+          transform(op_task_invocation.binding.get_tensor_bindings(),
+                    [&](SlotGradId const &slot_grad_id,
+                        OpTensorSpec const &op_tensor_spec) {
+                      return lower_tensor_binding(
+                          get_tensor_group_signature(training_layer),
+                          slot_grad_id,
+                          op_tensor_spec);
+                    });
 
-    SlotGradId slot_grad_id = tensor_binding.first;
+  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings = map_values(
+      op_task_invocation.binding.get_arg_bindings(),
+      [&](OpArgSpec const &op_arg_spec) {
+        return lower_to_task_arg_spec(op_arg_spec,
+                                      get_cg_op_shape_signature(training_layer),
+                                      training_layer.layer_guid,
+                                      device_specific_device_states);
+      });
 
-    if (slot_grad_id.is_grad == IsGrad::NO) {
-      binding.bind(slot_grad_id.slot_id, tensor_to_bind);
-    } else if (slot_grad_id.is_grad == IsGrad::YES) {
-      binding.bind_grad(slot_grad_id.slot_id,
-                        tensor_gradient_mapping.at(tensor_to_bind));
-    } else {
-      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}",
-                                         tensor_binding.first.is_grad));
-    }
-  }
+  return TaskInvocation{
+      op_task_invocation.task_id,
+      TaskBinding{
+          tensor_bindings,
+          arg_bindings,
+      },
+  };
+}
 
-  // args
-  for (auto const &arg_binding :
-       op_task_invocation.binding.get_arg_bindings()) {
-    if (arg_binding.second.has()) {
-      ConcreteArgSpec concrete_arg =
-          lower_to_concrete_arg_spec(arg_binding.second.get(),
-                                     input_tensor_shapes,
-                                     layer_guid,
-                                     device_states);
-      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg});
-    } else if (arg_binding.second.has()) {
-      binding.insert_arg_spec(
-          arg_binding.first,
-          TaskArgSpec{arg_binding.second.get()});
-    } else {
-      binding.insert_arg_spec(
-          arg_binding.first,
-          TaskArgSpec{arg_binding.second.get()});
-    }
+std::pair<tensor_sub_slot_id_t, training_tensor_guid_t>
+    lower_tensor_binding(TrainingLayerTensorGroupSignature const &signature,
+                         SlotGradId const &slot_grad_id,
+                         OpTensorSpec const &op_tensor_spec) {
+  auto [tensor_to_bind,
gradient_tensor_guid_to_bind] = [&] { + TrainingTensorGroup group = get_training_tensor_group_for_role_and_index( + signature, op_tensor_spec.role, op_tensor_spec.idx); + + return std::pair{ + group.forward_tensor, + group.gradient_tensor, + }; + }(); + + if (slot_grad_id.is_grad == IsGrad::NO) { + return std::pair{ + tensor_sub_slot_id_t{ + slot_grad_id.slot_id, + TensorType::FORWARD, + }, + training_tensor_guid_t{ + tensor_to_bind, + }, + }; + } else if (slot_grad_id.is_grad == IsGrad::YES) { + return std::pair{ + tensor_sub_slot_id_t{ + slot_grad_id.slot_id, + TensorType::GRADIENT, + }, + training_tensor_guid_t{ + gradient_tensor_guid_to_bind, + }, + }; + } else { + PANIC("Invalid value for IsGrad {}", slot_grad_id.is_grad); } +} - return TaskInvocation{op_task_invocation.task_id, binding}; +TaskArgSpec lower_to_task_arg_spec( + OpArgSpec const &op_arg_spec, + CGOperatorTensorShapeSignature const &op_shape_signature, + layer_guid_t const &layer_guid, + std::optional const + &device_specific_device_states) { + return op_arg_spec.visit(overload{ + [](ConcreteArgSpec const &concrete_arg_spec) { + return TaskArgSpec{concrete_arg_spec}; + }, + [](RuntimeArgRefSpec const &runtime_arg_ref_spec) { + return TaskArgSpec{runtime_arg_ref_spec}; + }, + [&](OpArgRefSpec const &op_arg_ref_spec) { + return TaskArgSpec{ + lower_to_concrete_arg_spec(op_arg_ref_spec, + op_shape_signature, + layer_guid, + device_specific_device_states), + }; + }, + }); } ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &op_arg_ref_spec, - std::vector const &input_tensor_shapes, + CGOperatorTensorShapeSignature const &op_signature, layer_guid_t const &op_guid, std::optional const &device_states) { - if (op_arg_ref_spec.holds()) { - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_states.value(), 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - TensorShape input_tensor_shape = - input_tensor_shapes.at(index_op_arg_ref.idx); - ParallelTensorShape shape = lift_to_parallel(input_tensor_shape); - return ConcreteArgSpec::create(shape); - } else { - throw mk_runtime_error("Unhandled op arg ref type"); - } + + OpArgRefType op_arg_ref_type = op_arg_ref_spec.get_ref_type(); + return op_arg_ref_type.visit(overload{ + [&](PerDeviceOpStateRefType const &) { + PerDeviceOpState per_device_op_state = + get_device_state_from_device_specific(device_states.value(), 0); + + return per_device_op_state.visit(overload{ + [&](auto const &x) { + ASSERT(matches(op_arg_ref_spec.get_type_index())); + return ConcreteArgSpec::create(x); + }, + }); + }, + [&](ParallelTensorShapeRefType const &ref_type) { + TensorShape tensor_shape = tensor_shape_for_role_and_index( + /*signature=*/op_signature, + /*tensor_role=*/ref_type.tensor_role, + /*index=*/ref_type.idx); + ParallelTensorShape shape = lift_to_parallel(tensor_shape); + return ConcreteArgSpec::create(shape); + }, + }); } ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &runtime_arg_ref_spec, RuntimeArgConfig const &runtime_arg_config) { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create(*(runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); + switch (runtime_arg_ref_spec.get_ref_type()) { + case 
RuntimeArgRefType::FF_HANDLE: + return ConcreteArgSpec::create(*(runtime_arg_config.ff_handle.get(0))); + case RuntimeArgRefType::PROFILING_SETTINGS: + return ConcreteArgSpec::create(runtime_arg_config.profiling_settings); + case RuntimeArgRefType::FF_ITERATION_CONFIG: + PANIC("FF_ITERATION_CONFIG is currently not handled. Please create an " + "issue or contact the FlexFlow train developers if you need this " + "feature."); + case RuntimeArgRefType::KERNEL_DEVICE_TYPE: + return ConcreteArgSpec::create(runtime_arg_config.kernel_device_type); + default: + PANIC(fmt::format("Unhandled RuntimeArgRefType {}", + runtime_arg_ref_spec.get_ref_type())); } } diff --git a/lib/task-spec/src/task-spec/op_tensor_spec.cc b/lib/task-spec/src/task-spec/op_tensor_spec.cc index 1d97e6ae16..ed312e47af 100644 --- a/lib/task-spec/src/task-spec/op_tensor_spec.cc +++ b/lib/task-spec/src/task-spec/op_tensor_spec.cc @@ -2,16 +2,16 @@ namespace FlexFlow { -OpTensorSpec input_tensor(int idx, OpSlotOptions option) { - return {TensorRole::INPUT, option, idx}; +OpTensorSpec input_tensor(nonnegative_int idx, OpSlotOptions option) { + return OpTensorSpec{TensorRole::INPUT, option, idx}; } -OpTensorSpec output_tensor(int idx, OpSlotOptions option) { - return {TensorRole::OUTPUT, option, idx}; +OpTensorSpec output_tensor(nonnegative_int idx, OpSlotOptions option) { + return OpTensorSpec{TensorRole::OUTPUT, option, idx}; } -OpTensorSpec weight_tensor(int idx, OpSlotOptions option) { - return {TensorRole::WEIGHT, option, idx}; +OpTensorSpec weight_tensor(nonnegative_int idx, OpSlotOptions option) { + return OpTensorSpec{TensorRole::WEIGHT, option, idx}; } } // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/ops/attention.cc b/lib/task-spec/src/task-spec/ops/attention.cc index 488517a02e..ea2282792a 100644 --- a/lib/task-spec/src/task-spec/ops/attention.cc +++ b/lib/task-spec/src/task-spec/ops/attention.cc @@ -15,9 +15,11 @@ #include "task-spec/ops/attention.h" #include "kernels/attention_kernels.h" +#include "kernels/device_handle_t.dtg.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" namespace FlexFlow { @@ -39,7 +41,8 @@ enum Slots { WEIGHTS, OUTPUT, HANDLE, - PER_DEVICE_STATE + PER_DEVICE_STATE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(MultiHeadAttentionAttrs const &attrs) { @@ -48,49 +51,66 @@ OpTaskInvocation init(MultiHeadAttentionAttrs const &attrs) { b.bind_arg(HANDLE, ff_handle()); b.bind_arg(ATTRS, attrs); - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(KEY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(1)); - b.bind_arg(VALUE_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(2)); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + + b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0_n)); + b.bind_arg(KEY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(1_n)); + b.bind_arg(VALUE_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(2_n)); b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); b.bind_arg(KPROJSIZE, get_kProjSize(attrs)); b.bind_arg(VPROJSIZE, get_vProjSize(attrs)); b.bind_arg(OPROJSIZE, get_oProjSize(attrs)); - return {task_id_t::ATTENTION_INIT_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ATTENTION_INIT_TASK_ID, + b, + }; } OpTaskInvocation forward(MultiHeadAttentionAttrs const &attrs) { OpTaskBinding b; - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - 
b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); + b.bind(QUERY, input_tensor(0_n)); + b.bind(KEY, input_tensor(1_n)); + b.bind(VALUE, input_tensor(2_n)); + b.bind(WEIGHTS, weight_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); + b.bind_arg(PER_DEVICE_STATE, + per_device_op_state>()); - return {task_id_t::ATTENTION_FWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ATTENTION_FWD_TASK_ID, + b, + }; } OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::ATTENTION_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ATTENTION_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); + + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + positive_int qProjSize = acc.get_argument(QPROJSIZE); positive_int kProjSize = acc.get_argument(KPROJSIZE); positive_int vProjSize = acc.get_argument(VPROJSIZE); positive_int oProjSize = acc.get_argument(OPROJSIZE); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); ParallelTensorShape query_parallel_tensor_shape = acc.get_argument(QUERY_PARALLEL_TENSOR_SHAPE); ParallelTensorShape key_parallel_tensor_shape = @@ -117,23 +137,27 @@ static DeviceSpecificDeviceStates positive_int num_samples = get_num_samples(parsed); positive_int num_heads = attrs.num_heads; - MHAPerDeviceState per_device_state = - init_kernel(handle, - allocator, - num_samples.int_from_positive_int(), - num_heads.int_from_positive_int(), - qSize.int_from_positive_int(), - kSize.int_from_positive_int(), - vSize.int_from_positive_int(), - qProjSize.int_from_positive_int(), - kProjSize.int_from_positive_int(), - vProjSize.int_from_positive_int(), - oProjSize.int_from_positive_int(), - qoSeqLength.int_from_positive_int(), - kvSeqLength.int_from_positive_int(), - attrs.add_bias_kv); + std::optional per_device_state = init_kernel( + /*device_type=*/kernel_device_type, + /*per_device_ff_handle=*/handle, + /*allocator=*/allocator, + /*num_samples=*/num_samples.int_from_positive_int(), + /*num_heads=*/num_heads.int_from_positive_int(), + /*qSize=*/qSize.int_from_positive_int(), + /*kSize=*/kSize.int_from_positive_int(), + /*vSize=*/vSize.int_from_positive_int(), + /*qProjSize=*/qProjSize.int_from_positive_int(), + /*kProjSize=*/kProjSize.int_from_positive_int(), + /*vProjSize=*/vProjSize.int_from_positive_int(), + /*oProjSize=*/oProjSize.int_from_positive_int(), + /*qoSeqLength=*/qoSeqLength.int_from_positive_int(), + /*kvSeqLength=*/kvSeqLength.int_from_positive_int(), + /*add_bias_kv=*/attrs.add_bias_kv); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -144,11 +168,14 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + std::optional 
per_device_state = + acc.get_argument>(PER_DEVICE_STATE); return profile(forward_kernel, profiling, + kernel_device_type, "[MultiHeadAttention] forward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), @@ -171,9 +198,11 @@ static std::optional auto key_grad = acc.get_tensor_grad(KEY); auto value_grad = acc.get_tensor_grad(VALUE); - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); + std::optional per_device_state = + acc.get_argument>(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); float *key_grad_ptr = (key_grad == query_grad) ? nullptr : key_grad.get_float_ptr(); @@ -181,14 +210,15 @@ static std::optional ? nullptr : value_grad.get_float_ptr(); - assert(value_grad.shape == value.shape); - assert(key_grad.shape == key.shape); + ASSERT(value_grad.shape == value.shape); + ASSERT(key_grad.shape == key.shape); - assert(query_grad.shape == query.shape); - assert(weight_grad.shape.num_elements() == weight.shape.num_elements()); + ASSERT(query_grad.shape == query.shape); + ASSERT(weight_grad.shape == weight.shape); return profile(backward_kernel, profiling, + kernel_device_type, "[MultiHeadAttention] backward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), @@ -224,7 +254,7 @@ OpTaskSignature get_attention_init_signature() { init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); - init.add_return_value(); + init.add_return_value>(); return init; } @@ -239,7 +269,8 @@ OpTaskSignature get_attention_fwd_signature() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot>( + PER_DEVICE_STATE); return fwd; } diff --git a/lib/task-spec/src/task-spec/ops/batch_matmul.cc b/lib/task-spec/src/task-spec/ops/batch_matmul.cc index 1ee9da82d3..f8d6955b41 100644 --- a/lib/task-spec/src/task-spec/ops/batch_matmul.cc +++ b/lib/task-spec/src/task-spec/ops/batch_matmul.cc @@ -17,6 +17,7 @@ #include "kernels/batch_matmul_kernels.h" #include "op-attrs/ops/batch_matmul.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" #include "utils/containers/transform.h" #include "utils/nonnegative_int/nonnegative_range.h" @@ -31,28 +32,36 @@ enum Slots { OUTPUT, // tensor PROFILING, HANDLE, - ITERATION_CONFIG + ITERATION_CONFIG, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation forward(BatchMatmulAttrs const &attrs) { OpTaskBinding fwd; - fwd.bind(A_INPUT, input_tensor(0)); - fwd.bind(B_INPUT, input_tensor(1)); - fwd.bind(OUTPUT, output_tensor(0)); + fwd.bind(A_INPUT, input_tensor(0_n)); + fwd.bind(B_INPUT, input_tensor(1_n)); + fwd.bind(OUTPUT, output_tensor(0_n)); fwd.bind_arg(ATTRS, attrs); fwd.bind_arg(HANDLE, ff_handle()); fwd.bind_arg(PROFILING, profiling_settings()); fwd.bind_arg(ITERATION_CONFIG, iteration_config()); + fwd.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::BATCHMATMUL_FWD_TASK_ID, fwd}; + return OpTaskInvocation{ + task_id_t::BATCHMATMUL_FWD_TASK_ID, + fwd, + }; } OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { OpTaskBinding bwd = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::BATCHMATMUL_BWD_TASK_ID, bwd}; + return OpTaskInvocation{ + task_id_t::BATCHMATMUL_BWD_TASK_ID, + bwd, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -60,27 +69,32 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto b_input = acc.get_tensor(B_INPUT); auto 
output = acc.get_tensor<Permissions::WO>(OUTPUT);
   auto attrs = acc.get_argument<BatchMatmulAttrs>(ATTRS);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
   FFIterationConfig iter_config =
       acc.get_argument<FFIterationConfig>(ITERATION_CONFIG);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
 
-  positive_int m = b_input.shape.at(legion_dim_t{0_n});
-  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
-  positive_int n = a_input.shape.at(legion_dim_t{1_n});
-  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
-  positive_int k = a_input.shape.at(legion_dim_t{0_n});
-  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
+  positive_int m = dim_at_idx(b_input.shape.dims, legion_dim_t{0_n});
+  ASSERT(m == dim_at_idx(output.shape.dims, legion_dim_t{0_n}));
+  positive_int n = dim_at_idx(a_input.shape.dims, legion_dim_t{1_n});
+  ASSERT(n == dim_at_idx(output.shape.dims, legion_dim_t{1_n}));
+  positive_int k = dim_at_idx(a_input.shape.dims, legion_dim_t{0_n});
+  ASSERT(k == dim_at_idx(b_input.shape.dims, legion_dim_t{1_n}));
 
-  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
-  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+  ASSERT(get_num_elements(a_input.shape.dims) ==
+         get_num_elements(b_input.shape.dims));
+  ASSERT(get_num_elements(a_input.shape.dims) ==
+         get_num_elements(output.shape.dims));
 
   positive_int batch = 1_p;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) {
-    positive_int dim_size = a_input.shape.at(legion_dim_t{i});
-    ASSERT(dim_size == b_input.shape.at(legion_dim_t{i}));
-    ASSERT(dim_size == output.shape.at(legion_dim_t{i}));
+  for (nonnegative_int i :
+       nonnegative_range(2_n, get_num_dims(a_input.shape.dims))) {
+    positive_int dim_size = dim_at_idx(a_input.shape.dims, legion_dim_t{i});
+    ASSERT(dim_size == dim_at_idx(b_input.shape.dims, legion_dim_t{i}));
+    ASSERT(dim_size == dim_at_idx(output.shape.dims, legion_dim_t{i}));
 
     batch *= dim_size;
   }
@@ -92,6 +106,7 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 
   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[BatchMatmul] forward_time = {:.2lf}ms\n",
                  handle,
                  output.get_float_ptr(),
@@ -112,7 +127,9 @@ static std::optional<float>
   FFIterationConfig iter_config =
       acc.get_argument<FFIterationConfig>(ITERATION_CONFIG);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
 
   auto output = acc.get_tensor<Permissions::RO>(OUTPUT);
   auto output_grad = acc.get_tensor_grad<Permissions::RO>(OUTPUT);
@@ -127,25 +144,29 @@ static std::optional<float>
   ASSERT(b_input.shape == b_input_grad.shape);
 
   // check dims
-  positive_int m = b_input.shape.at(legion_dim_t{0_n});
-  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
-  positive_int n = a_input.shape.at(legion_dim_t{1_n});
-  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
-  positive_int k = a_input.shape.at(legion_dim_t{0_n});
-  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
-  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
-  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+  positive_int m = dim_at_idx(b_input.shape.dims, legion_dim_t{0_n});
+  ASSERT(m == dim_at_idx(output.shape.dims, legion_dim_t{0_n}));
+  positive_int n = dim_at_idx(a_input.shape.dims, legion_dim_t{1_n});
+  ASSERT(n == dim_at_idx(output.shape.dims, legion_dim_t{1_n}));
+
positive_int k = dim_at_idx(a_input.shape.dims, legion_dim_t{0_n}); + ASSERT(k == dim_at_idx(b_input.shape.dims, legion_dim_t{1_n})); + ASSERT(get_num_elements(a_input.shape.dims) == + get_num_elements(b_input.shape.dims)); + ASSERT(get_num_elements(a_input.shape.dims) == + get_num_elements(output.shape.dims)); positive_int batch = 1_p; - for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) { - positive_int dim_size = a_input.shape.at(legion_dim_t{i}); - ASSERT(dim_size == b_input.shape.at(legion_dim_t{i})); - ASSERT(dim_size == output.shape.at(legion_dim_t{i})); + for (nonnegative_int i : + nonnegative_range(2_n, get_num_dims(a_input.shape.dims))) { + positive_int dim_size = dim_at_idx(a_input.shape.dims, legion_dim_t{i}); + ASSERT(dim_size == dim_at_idx(b_input.shape.dims, legion_dim_t{i})); + ASSERT(dim_size == dim_at_idx(output.shape.dims, legion_dim_t{i})); batch *= dim_size; } return profile(backward_kernel, profiling, + kernel_device_type, "[BatchMatmul] backward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), @@ -175,7 +196,8 @@ OpTaskSignature get_batch_matmul_fwd_signature() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(HANDLE); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); + fwd.add_unchecked_arg_slot(HANDLE); return fwd; } diff --git a/lib/task-spec/src/task-spec/ops/batch_norm.cc b/lib/task-spec/src/task-spec/ops/batch_norm.cc index 67c5a7d8a2..0599eec3f5 100644 --- a/lib/task-spec/src/task-spec/ops/batch_norm.cc +++ b/lib/task-spec/src/task-spec/ops/batch_norm.cc @@ -15,91 +15,112 @@ #include "task-spec/ops/batch_norm.h" #include "kernels/batch_norm_kernels.h" +#include "task-spec/profiling.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchNorm; enum Slots { - INPUT, // tensor - SCALE, // tensor - BIAS, // tensor - OUTPUT, // tensor + INPUT, + SCALE, + BIAS, + OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, RELU, - HANDLE + HANDLE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(BatchNormAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(BIAS, input_tensor(2)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(BIAS, weight_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::BATCHNORM_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::BATCHNORM_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(BatchNormAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + binding.bind_arg( + PER_DEVICE_STATE, + per_device_op_state>()); - binding.bind(INPUT, input_tensor(0)); - binding.bind(SCALE, input_tensor(1)); - binding.bind(BIAS, input_tensor(2)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(SCALE, weight_tensor(0_n)); + binding.bind(BIAS, weight_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); - return {task_id_t::BATCHNORM_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::BATCHNORM_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(BatchNormAttrs const &attrs) { OpTaskBinding binding = 
infer_bwd_binding(forward(attrs).binding); - return {task_id_t::BATCHNORM_BWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::BATCHNORM_BWD_TASK_ID, + binding, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { Allocator allocator = acc.get_allocator(); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - positive_int output_w = output.shape.at(legion_dim_t{0_n}); - positive_int output_h = output.shape.at(legion_dim_t{1_n}); - positive_int output_c = output.shape.at(legion_dim_t{2_n}); - positive_int output_n = output.shape.at(legion_dim_t{3_n}); + positive_int output_w = dim_at_idx(output.shape.dims, legion_dim_t{0_n}); + positive_int output_h = dim_at_idx(output.shape.dims, legion_dim_t{1_n}); + positive_int output_c = dim_at_idx(output.shape.dims, legion_dim_t{2_n}); + positive_int output_n = dim_at_idx(output.shape.dims, legion_dim_t{3_n}); float *runningMean; - BatchNormPerDeviceState per_device_state = - init_kernel(handle, - allocator, - runningMean, - output_n.int_from_positive_int(), - output_c.int_from_positive_int(), - output_h.int_from_positive_int(), - output_w.int_from_positive_int(), - attrs.relu); + std::optional per_device_state = init_kernel( + /*device_type=*/kernel_device_type, + /*handle=*/handle, + /*allocator=*/allocator, + /*runningMean=*/runningMean, + /*output_n=*/output_n.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), + /*relu=*/attrs.relu); return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -108,6 +129,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[BatchNorm] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), @@ -121,6 +143,8 @@ static std::optional auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto input_grad = acc.get_tensor_grad(INPUT); @@ -132,6 +156,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, output.get_float_ptr(), @@ -141,7 +166,7 @@ static std::optional scale.get_float_ptr(), scale_grad.get_float_ptr(), bias_grad.get_float_ptr(), - output.shape.num_elements().int_from_positive_int()); + get_num_elements(output.shape.dims).int_from_positive_int()); } TaskImplFunction get_batch_norm_init_task_impl() { @@ -162,7 +187,7 @@ OpTaskSignature get_batch_norm_init_signature() { init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); init.add_arg_slot(PROFILING); - init.add_unchecked_arg_slot(HANDLE); + 
init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);
 
   return init;
 }
 
@@ -175,7 +200,9 @@ OpTaskSignature get_batch_norm_fwd_signature() {
   fwd.add_input_slot(BIAS);
   fwd.add_output_slot(OUTPUT);
   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_unchecked_arg_slot<BatchNormPerDeviceState>(PER_DEVICE_STATE);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
+  fwd.add_unchecked_arg_slot<std::optional<BatchNormPerDeviceState>>(
+      PER_DEVICE_STATE);
 
   return fwd;
 }
diff --git a/lib/task-spec/src/task-spec/ops/cast.cc b/lib/task-spec/src/task-spec/ops/cast.cc
index 7cf26be95b..0c00f1be58 100644
--- a/lib/task-spec/src/task-spec/ops/cast.cc
+++ b/lib/task-spec/src/task-spec/ops/cast.cc
@@ -15,36 +15,45 @@
 #include "task-spec/ops/cast.h"
 #include "kernels/cast_kernels.h"
-
 #include "task-spec/op_task_signature.h"
+#include "task-spec/profiling.h"
 #include "utils/hash-utils.h"
 
 using namespace FlexFlow::Kernels::Cast;
 
 namespace FlexFlow {
 
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };
 
 OpTaskInvocation forward(CastAttrs const &attrs) {
   OpTaskBinding binding;
 
   binding.bind_arg(PROFILING, profiling_settings());
   binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
 
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));
 
-  return {task_id_t::CAST_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::CAST_FWD_TASK_ID,
+      binding,
+  };
 }
 
 OpTaskInvocation backward(CastAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
 
-  return {task_id_t::CAST_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::CAST_BWD_TASK_ID,
+      binding,
+  };
 }
 
 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto const &attrs = acc.get_argument<CastAttrs>(ATTRS);
 
   auto input = acc.get_tensor<Permissions::RO>(INPUT);
@@ -52,6 +61,7 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 
   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Cast] forward_time = {:.2lf}ms\n",
                  input,
                  output);
@@ -60,6 +70,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto const &attrs = acc.get_argument<CastAttrs>(ATTRS);
 
   auto input = acc.get_tensor<Permissions::RO>(INPUT);
@@ -69,6 +81,7 @@ static std::optional<float>
 
   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Cast] backward_time = {:.2lf}ms\n",
                  input_grad,
                  output_grad);
@@ -86,6 +99,7 @@ OpTaskSignature get_cast_fwd_signature() {
 
   fwd.add_arg_slot<CastAttrs>(ATTRS);
   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
 
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
diff --git a/lib/task-spec/src/task-spec/ops/combine.cc b/lib/task-spec/src/task-spec/ops/combine.cc
deleted file mode 100644
index 41c276facb..0000000000
--- a/lib/task-spec/src/task-spec/ops/combine.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "task-spec/ops/combine.h" -#include "kernels/combine_kernels.h" -#include "task-spec/op_task_invocation.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Combine; - -enum Slots { INPUT, OUTPUT, PROFILING }; - -OpTaskInvocation forward(CombineAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::COMBINE_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(CombineAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::COMBINE_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Combine] forward_time = {:.2lf}ms\n", - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Combine] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); -} - -OpTaskSignature get_combine_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_combine_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_combine_fwd_signature()); - - return bwd; -} - -TaskImplFunction get_combine_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_combine_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -}; // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/ops/concat.cc b/lib/task-spec/src/task-spec/ops/concat.cc index 2cb082d1eb..26aa64f6ec 100644 --- a/lib/task-spec/src/task-spec/ops/concat.cc +++ b/lib/task-spec/src/task-spec/ops/concat.cc @@ -16,6 +16,7 @@ #include "task-spec/ops/concat.h" #include "kernels/concat_kernels.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" #include "task-spec/variadic_tensor_ref.h" #include "utils/hash-utils.h" @@ -23,26 +24,43 @@ namespace FlexFlow { using namespace FlexFlow::Kernels::Concat; -enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; +enum Slots { + INPUTS, + OUTPUT, + ATTRS, + PROFILING, + HANDLE, + NUM_INPUTS, + KERNEL_DEVICE_TYPE +}; OpTaskInvocation forward(ConcatAttrs const &attrs) { OpTaskBinding binding; binding.bind(INPUTS, get_input_tensors()); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(ATTRS, attrs); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return 
{task_id_t::CONCAT_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONCAT_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(ConcatAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::CONCAT_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::CONCAT_BWD_TASK_ID, + b, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); auto output = acc.get_tensor(OUTPUT); @@ -52,6 +70,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[Concat] forward_time = {:.2lf}ms\n", output, inputs, @@ -61,6 +80,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); auto input_grads = acc.get_variadic_tensor_grad(INPUTS); @@ -70,6 +91,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Concat] backward_time = {:.2lf}ms\n", output_grad, input_grads, @@ -88,6 +110,7 @@ OpTaskSignature get_concat_fwd_signature() { fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); fwd.add_output_slot(OUTPUT); diff --git a/lib/task-spec/src/task-spec/ops/conv_2d.cc b/lib/task-spec/src/task-spec/ops/conv_2d.cc index ea4f7f79df..d7110eabfa 100644 --- a/lib/task-spec/src/task-spec/ops/conv_2d.cc +++ b/lib/task-spec/src/task-spec/ops/conv_2d.cc @@ -1,5 +1,6 @@ #include "task-spec/ops/conv_2d.h" #include "kernels/conv_2d_kernels.h" +#include "task-spec/profiling.h" namespace FlexFlow { @@ -13,19 +14,24 @@ enum Slots { ATTRS, PROFILING, PER_DEVICE_STATE, - HANDLE + HANDLE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(Conv2DAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind(FILTER, weight_tensor(0_n)); binding.bind_arg(ATTRS, attrs); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::CONV2D_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONV2D_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(Conv2DAttrs const &attrs) { @@ -33,53 +39,68 @@ OpTaskInvocation forward(Conv2DAttrs const &attrs) { binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); - binding.bind(BIAS, weight_tensor(1)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind(FILTER, weight_tensor(0_n)); + binding.bind(BIAS, weight_tensor(1_n)); - return {task_id_t::CONV2D_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONV2D_FWD_TASK_ID, + 
binding, + }; } OpTaskInvocation backward(Conv2DAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::CONV2D_BWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONV2D_BWD_TASK_ID, + binding, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto filter = acc.get_tensor(FILTER); auto filter_grad = acc.get_tensor_grad(FILTER); - Conv2DPerDeviceState per_device_state = - init_kernel(/*handle=*/handle, - /*activation=*/attrs.activation, - /*kernel_h=*/attrs.kernel_h.int_from_positive_int(), - /*kernel_w=*/attrs.kernel_w.int_from_positive_int(), - /*groups=*/attrs.groups.int_from_positive_int(), - /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), - /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), - /*stride_h=*/attrs.stride_h.int_from_positive_int(), - /*stride_w=*/attrs.stride_w.int_from_positive_int(), - /*input=*/input, - /*output=*/output, - /*filter_ptr=*/filter.get_float_ptr(), - /*filter_grad_ptr=*/filter_grad.get_float_ptr()); + std::optional per_device_state = init_kernel( + /*device_type=*/kernel_device_type, + /*handle=*/handle, + /*activation=*/attrs.activation, + /*kernel_h=*/attrs.kernel_h.int_from_positive_int(), + /*kernel_w=*/attrs.kernel_w.int_from_positive_int(), + /*groups=*/attrs.groups.int_from_positive_int(), + /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), + /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), + /*stride_h=*/attrs.stride_h.int_from_positive_int(), + /*stride_w=*/attrs.stride_w.int_from_positive_int(), + /*input=*/input, + /*output=*/output, + /*filter_ptr=*/filter.get_float_ptr(), + /*filter_grad_ptr=*/filter_grad.get_float_ptr()); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); @@ -91,6 +112,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[Conv2d] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), @@ -103,6 +125,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); @@ -118,6 +142,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, output.get_float_ptr(), @@ -147,7 +172,8 @@ OpTaskSignature get_conv_2d_init_signature() { init.add_output_slot(OUTPUT); init.add_weight_slot(FILTER); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); 
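// The std::optional-wrapped per-device state above is the pattern this patch
// applies to every operator: init_kernel now takes the kernel DeviceType
// first and returns an empty optional when there is no device-specific state
// to set up (e.g. on the CPU path). A minimal sketch of that contract, using
// the hypothetical names FooPerDeviceState / foo_init_kernel rather than any
// real FlexFlow API:

#include <optional>

enum class DeviceType { CPU, GPU };

struct FooPerDeviceState {
  void *workspace; // e.g. cuDNN descriptors, scratch buffers, ...
};

std::optional<FooPerDeviceState> foo_init_kernel(DeviceType device_type) {
  if (device_type == DeviceType::CPU) {
    return std::nullopt; // CPU kernels carry no per-device state
  }
  return FooPerDeviceState{/*workspace=*/nullptr};
}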
init.add_return_value(); @@ -159,6 +185,7 @@ OpTaskSignature get_conv_2d_fwd_signature() { fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_arg_slot(ATTRS); fwd.add_input_slot(INPUT); diff --git a/lib/task-spec/src/task-spec/ops/dropout.cc b/lib/task-spec/src/task-spec/ops/dropout.cc index d19ace886b..a36506984e 100644 --- a/lib/task-spec/src/task-spec/ops/dropout.cc +++ b/lib/task-spec/src/task-spec/ops/dropout.cc @@ -2,65 +2,99 @@ #include "kernels/dropout_kernels.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" #include "utils/hash-utils.h" namespace FlexFlow { using namespace FlexFlow::Kernels::Dropout; -enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; +enum Slots { + INPUT, + OUTPUT, + ATTRS, + PER_DEVICE_STATE, + FF_HANDLE, + PROFILING, + KERNEL_DEVICE_TYPE +}; OpTaskInvocation init(DropoutAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(ATTRS, attrs); binding.bind_arg(FF_HANDLE, ff_handle()); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::DROPOUT_INIT_TASK_ID, binding}; + binding.bind(OUTPUT, output_tensor(0_n)); + + return OpTaskInvocation{ + task_id_t::DROPOUT_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(DropoutAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - return {task_id_t::DROPOUT_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::DROPOUT_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(DropoutAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::DROPOUT_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::DROPOUT_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); Allocator allocator = acc.get_allocator(); - PerDeviceFFHandle handle = acc.get_argument(FF_HANDLE); + device_handle_t handle = acc.get_argument(FF_HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - DropoutPerDeviceState per_device_state = - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); + std::optional per_device_state = + init_kernel(kernel_device_type, + handle, + attrs.rate, + attrs.seed, + output.shape, + allocator); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + acc.get_argument>(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, + kernel_device_type, "[Dropout] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), @@ -73,12 +107,15 @@ static std::optional auto 
per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, + kernel_device_type, "[Dropout] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), @@ -99,10 +136,11 @@ OpTaskSignature get_dropout_init_signature() { OpTaskSignature init(OpTaskType::INIT); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(FF_HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); - init.add_return_value(); + init.add_return_value>(); return init; } @@ -110,8 +148,10 @@ OpTaskSignature get_dropout_init_signature() { OpTaskSignature get_dropout_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot>( + PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/task-spec/src/task-spec/ops/element_binary.cc b/lib/task-spec/src/task-spec/ops/element_binary.cc index 5356901423..a5f9f012fe 100644 --- a/lib/task-spec/src/task-spec/ops/element_binary.cc +++ b/lib/task-spec/src/task-spec/ops/element_binary.cc @@ -1,5 +1,6 @@ #include "task-spec/ops/element_binary.h" #include "kernels/element_binary_kernels.h" +#include "task-spec/profiling.h" #include "task-spec/task_signature_impl.h" #include "utils/hash-utils.h" @@ -14,40 +15,55 @@ enum Slots { PROFILING, PER_DEVICE_STATE, HANDLE, - ATTRS + ATTRS, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(ElementBinaryAttrs const &attrs) { OpTaskBinding binding; - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(LHS_INPUT, input_tensor(0_n)); + binding.bind(RHS_INPUT, input_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind_arg(ATTRS, attrs); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::ELEMENTBINARY_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::ELEMENTBINARY_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(ElementBinaryAttrs const &attrs) { OpTaskBinding binding; - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(LHS_INPUT, input_tensor(0_n)); + binding.bind(RHS_INPUT, input_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + binding.bind_arg( + PER_DEVICE_STATE, + per_device_op_state>()); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::ELEMENTBINARY_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::ELEMENTBINARY_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(ElementBinaryAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::ELEMENTBINARY_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ELEMENTBINARY_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates @@ -56,23 +72,31 @@ static DeviceSpecificDeviceStates auto input_rhs = 
acc.get_tensor(RHS_INPUT); auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - ElementBinaryPerDeviceState per_device_state = - init_kernel(handle, + std::optional per_device_state = + init_kernel(kernel_device_type, + handle, attrs.type, attrs.should_broadcast_lhs, attrs.should_broadcast_rhs, input_lhs.shape, input_rhs.shape, output.shape); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto const &attrs = acc.get_argument(ATTRS); @@ -80,10 +104,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input_lhs = acc.get_tensor(LHS_INPUT); auto input_rhs = acc.get_tensor(RHS_INPUT); auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); return profile(forward_kernel, profiling, + kernel_device_type, "[ElementBinary] forward_time = {:.2lf}ms\n", per_device_state, input_lhs.get_float_ptr(), @@ -99,8 +124,10 @@ static std::optional auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); auto input_lhs = acc.get_tensor(LHS_INPUT); auto input_rhs = acc.get_tensor(RHS_INPUT); @@ -111,6 +138,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[ElementBinary] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), @@ -142,8 +170,10 @@ OpTaskSignature get_element_binary_init_signature() { init.add_input_slot(LHS_INPUT); init.add_input_slot(RHS_INPUT); init.add_output_slot(OUTPUT); + init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); @@ -156,7 +186,8 @@ OpTaskSignature get_element_binary_fwd_signature() { fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(HANDLE); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); + fwd.add_unchecked_arg_slot(HANDLE); fwd.add_input_slot(LHS_INPUT); fwd.add_input_slot(RHS_INPUT); diff --git a/lib/task-spec/src/task-spec/ops/element_unary.cc b/lib/task-spec/src/task-spec/ops/element_unary.cc index 1f4e651251..f8df53b578 100644 --- a/lib/task-spec/src/task-spec/ops/element_unary.cc +++ b/lib/task-spec/src/task-spec/ops/element_unary.cc @@ -1,6 +1,7 @@ #include "task-spec/ops/element_unary.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/parallel_tensor_shape.h" +#include "task-spec/profiling.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -13,10 +14,12 @@ enum Slots { INPUT, INPUT_SHAPE, OUTPUT, + OUTPUT_SHAPE, ATTRS, HANDLE, PROFILING, - PER_DEVICE_STATE + PER_DEVICE_STATE, + KERNEL_DEVICE_TYPE, }; /* ElementUnary */ @@ -24,49 +27,67 @@ 
OpTaskInvocation init(ElementUnaryAttrs const &attrs) { OpTaskBinding b; b.bind_arg(ATTRS, attrs); - b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::ELEMENTUNARY_INIT_TASK_ID, b}; + b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0_n)); + b.bind_arg(OUTPUT_SHAPE, output_parallel_tensor_shape(0_n)); + + return OpTaskInvocation{ + task_id_t::ELEMENTUNARY_INIT_TASK_ID, + b, + }; } OpTaskInvocation forward(ElementUnaryAttrs const &attrs) { OpTaskBinding b; - b.bind(INPUT, input_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); + b.bind(INPUT, input_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); b.bind_arg(ATTRS, attrs); b.bind_arg(HANDLE, ff_handle()); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - return {task_id_t::ELEMENTUNARY_FWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ELEMENTUNARY_FWD_TASK_ID, + b, + }; } OpTaskInvocation backward(ElementUnaryAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::ELEMENTUNARY_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ELEMENTUNARY_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); ParallelTensorShape input_shape = acc.get_argument(INPUT_SHAPE); - ParallelTensorShape output_shape = - throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = - init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), - array_shape_from_tensor_shape(get_piece_shape(output_shape)), + acc.get_argument(OUTPUT_SHAPE); + + std::optional per_device_state = + init_kernel(kernel_device_type, + get_piece_shape(input_shape), + get_piece_shape(output_shape), attrs); return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -74,14 +95,17 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - auto handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); return profile(forward_kernel, profiling, + kernel_device_type, "[ElementUnary] forward_time = {:.2lf}ms\n", per_device_state, attrs, @@ -98,14 +122,17 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); return profile(backward_kernel, profiling, + kernel_device_type, "[ElementUnary] backward_time = {:.2lf}ms\n", per_device_state, attrs, @@ -131,7 +158,8 @@ OpTaskSignature get_element_unary_init_signature() { init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + 
init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); @@ -145,6 +173,7 @@ OpTaskSignature get_element_unary_fwd_signature() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); return fwd; diff --git a/lib/task-spec/src/task-spec/ops/embedding.cc b/lib/task-spec/src/task-spec/ops/embedding.cc new file mode 100644 index 0000000000..4ba32c8483 --- /dev/null +++ b/lib/task-spec/src/task-spec/ops/embedding.cc @@ -0,0 +1,120 @@ +#include "task-spec/ops/embedding.h" +#include "kernels/embedding_kernels.h" +#include "task-spec/profiling.h" + +namespace FlexFlow { + +using namespace FlexFlow::Kernels::Embedding; + +enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE }; + +OpTaskInvocation forward(EmbeddingAttrs const &attrs) { + OpTaskBinding b; + + b.bind(INPUT, input_tensor(0_n)); + b.bind(WEIGHT, weight_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); + + b.bind_arg(ATTRS, attrs); + b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + + return OpTaskInvocation{ + task_id_t::EMBED_FWD_TASK_ID, + b, + }; +} + +OpTaskInvocation backward(EmbeddingAttrs const &attrs) { + OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); + + return OpTaskInvocation{ + task_id_t::EMBED_BWD_TASK_ID, + b, + }; +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto weight = acc.get_tensor(WEIGHT); + auto output = acc.get_tensor(OUTPUT); + + ProfilingSettings profiling = acc.get_argument(PROFILING); + EmbeddingAttrs attrs = acc.get_argument(ATTRS); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + + return profile( + forward_kernel, + profiling, + kernel_device_type, + "[Embedding] forward_time = {:.2lf}ms\n", + input, + output, + weight, + input.shape.data_type, + output.shape.data_type, + attrs.aggr, + get_num_dims(input.shape.dims).unwrap_nonnegative(), + get_num_dims(output.shape.dims).unwrap_nonnegative(), + dim_at_idx(input.shape.dims, legion_dim_t{1_n}).int_from_positive_int()); +} + +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto weight_grad = acc.get_tensor_grad(WEIGHT); + + ProfilingSettings profiling = acc.get_argument(PROFILING); + EmbeddingAttrs attrs = acc.get_argument(ATTRS); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + + return profile( + backward_kernel, + profiling, + kernel_device_type, + "[Embedding] backward_time = {:.2lf}ms\n", + output, + input, + weight_grad, + output.shape.data_type, + input.shape.data_type, + attrs.aggr, + get_num_dims(input.shape.dims).unwrap_nonnegative(), + get_num_dims(output.shape.dims).unwrap_nonnegative(), + dim_at_idx(input.shape.dims, ff_dim_t{0_n}).int_from_positive_int()); +} + +TaskImplFunction get_embedding_fwd_task_impl() { + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; +} +TaskImplFunction get_embedding_bwd_task_impl() { + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; +} + +OpTaskSignature get_embedding_fwd_signature() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_input_slot(INPUT); + fwd.add_input_slot(OUTPUT); + fwd.add_input_slot(WEIGHT); + + fwd.add_arg_slot(ATTRS); + fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); + + 
return fwd; +} + +OpTaskSignature get_embedding_bwd_signature() { + OpTaskSignature bwd = infer_bwd_signature(get_embedding_fwd_signature()); + return bwd; +} + +std::vector get_task_ids(EmbeddingAttrs const &) { + return {task_id_t::EMBED_FWD_TASK_ID, task_id_t::EMBED_BWD_TASK_ID}; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/ops/flat.cc b/lib/task-spec/src/task-spec/ops/flat.cc index 1bc0999e1a..6cec1b383f 100644 --- a/lib/task-spec/src/task-spec/ops/flat.cc +++ b/lib/task-spec/src/task-spec/ops/flat.cc @@ -1,35 +1,47 @@ #include "task-spec/ops/flat.h" #include "kernels/flat_kernels.h" +#include "task-spec/profiling.h" namespace FlexFlow { using namespace FlexFlow::Kernels::Flat; -enum SLOTS { INPUT, OUTPUT, HANDLE, PROFILING }; +enum SLOTS { INPUT, OUTPUT, HANDLE, PROFILING, KERNEL_DEVICE_TYPE }; OpTaskInvocation forward(FlatAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(PROFILING, profiling_settings()); - return {task_id_t::FLAT_FWD_TASK_ID, binding}; + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + + return OpTaskInvocation{ + task_id_t::FLAT_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(FlatAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::FLAT_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::FLAT_BWD_TASK_ID, + b, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, + kernel_device_type, "[Flat] forward_time = {:.2lf}ms\n", input, output.get_float_ptr()); @@ -38,6 +50,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); @@ -45,6 +59,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Flat] backward_time = {:.2lf}ms\n", input, output_grad.get_float_ptr(), @@ -62,6 +77,7 @@ OpTaskSignature get_flat_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/task-spec/src/task-spec/ops/gather.cc b/lib/task-spec/src/task-spec/ops/gather.cc index 5f7173a991..7f8aacf9d6 100644 --- a/lib/task-spec/src/task-spec/ops/gather.cc +++ b/lib/task-spec/src/task-spec/ops/gather.cc @@ -15,6 +15,8 @@ #include "task-spec/ops/gather.h" #include "kernels/gather_kernels.h" +#include "op-attrs/ff_ordered/get_idxs.h" +#include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_range.h" #include @@ -22,18 +24,31 @@ namespace FlexFlow { using namespace FlexFlow::Kernels::Gather; -enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; +enum Slots { + INPUT, + OUTPUT, + INDEX, + ATTRS, + HANDLE, + PROFILING, + PER_DEVICE_STATE, + KERNEL_DEVICE_TYPE +}; OpTaskInvocation init(GatherAttrs const &attrs) { OpTaskBinding 
binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(INDEX, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(INDEX, input_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(ATTRS, attrs); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::GATHER_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::GATHER_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(GatherAttrs const &attrs) { @@ -41,20 +56,27 @@ OpTaskInvocation forward(GatherAttrs const &attrs) { binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(INDEX, weight_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind(INDEX, weight_tensor(0_n)); - return {task_id_t::GATHER_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::GATHER_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(GatherAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::GATHER_BWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::GATHER_BWD_TASK_ID, + binding, + }; } static DeviceSpecificDeviceStates @@ -63,29 +85,34 @@ static DeviceSpecificDeviceStates auto index = acc.get_tensor(INDEX); auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - legion_dim_t legion_dim = - legion_dim_from_ff_dim(attrs.dim, input.shape.num_dims()); - assert(input.shape.num_dims() == index.shape.num_dims()); - assert(output.shape.num_dims() == index.shape.num_dims()); + ASSERT(get_num_dims(input.shape.dims) == get_num_dims(index.shape.dims)); + ASSERT(get_num_dims(output.shape.dims) == get_num_dims(index.shape.dims)); - for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { - assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); - if (i != legion_dim.value) { - assert(input.shape.at(legion_dim_t{i}) == - index.shape.at(legion_dim_t{i})); + for (ff_dim_t i : get_idxs(input.shape.dims.ff_ordered)) { + ASSERT(dim_at_idx(index.shape.dims, i) == dim_at_idx(output.shape.dims, i)); + if (i != attrs.dim) { + ASSERT(dim_at_idx(input.shape.dims, i) == + dim_at_idx(index.shape.dims, i)); } } - GatherPerDeviceState per_device_state = {handle, legion_dim}; + std::optional per_device_state = + init_kernel(kernel_device_type, handle, attrs.dim); return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -95,6 +122,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[Gather] forward_time = {:.2lf}ms\n", per_device_state, input, @@ 
-105,6 +133,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -114,6 +144,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Gather] backward_time = {:.2lf}ms\n", per_device_state, output_grad, @@ -139,7 +170,8 @@ OpTaskSignature get_gather_init_signature() { init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); @@ -150,6 +182,7 @@ OpTaskSignature get_gather_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_arg_slot(ATTRS); fwd.add_input_slot(INPUT); diff --git a/lib/task-spec/src/task-spec/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc index 8db2281bcf..b37e63c2d1 100644 --- a/lib/task-spec/src/task-spec/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -15,8 +15,11 @@ #include "task-spec/ops/layer_norm.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" +#include "task-spec/profiling.h" +#include "utils/containers/product.h" #include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/nonnegative_int/nonnegative_range.h" @@ -34,37 +37,50 @@ enum Slots { BETA, PER_DEVICE_STATE, ATTRS, - HANDLE + HANDLE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(LayerNormAttrs const &attrs) { OpTaskBinding b; - b.bind(INPUT, input_tensor(0)); + b.bind(INPUT, input_tensor(0_n)); b.bind_arg(HANDLE, ff_handle()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(ATTRS, attrs); - return {task_id_t::LAYERNORM_INIT_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::LAYERNORM_INIT_TASK_ID, + b, + }; } OpTaskInvocation forward(LayerNormAttrs const &attrs) { OpTaskBinding b; - b.bind(INPUT, input_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem - b.bind(BETA, weight_tensor(1)); // how to get gmmam and beta + b.bind(INPUT, input_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); + b.bind(GAMMA, weight_tensor(0_n)); + b.bind(BETA, weight_tensor(1_n)); b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - - return {task_id_t::LAYERNORM_FWD_TASK_ID, b}; + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + b.bind_arg(PER_DEVICE_STATE, + per_device_op_state>()); + + return OpTaskInvocation{ + task_id_t::LAYERNORM_FWD_TASK_ID, + b, + }; } OpTaskInvocation backward(LayerNormAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::LAYERNORM_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::LAYERNORM_BWD_TASK_ID, + b, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -74,10 +90,13 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto beta = acc.get_tensor(BETA); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto &state = 
       acc.get_argument(PER_DEVICE_STATE);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[LayerNorm] forward time = {:.2lf}ms\n",
                  state,
                  input,
@@ -97,10 +116,13 @@ static std::optional<float>
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto &state =
       acc.get_argument(PER_DEVICE_STATE);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[LayerNorm] backward time = {:.2lf}ms\n",
                  state,
                  output_grad,
@@ -114,33 +136,35 @@ static std::optional<float>
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<LayerNormAttrs>(ATTRS);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   Allocator allocator = acc.get_allocator();
   auto input = acc.get_tensor(INPUT);
-  auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-
-  positive_int M = 1_p;
-  for (int i = 0; i < attrs.axes.size(); i++) {
-    legion_dim_t legion_dim =
-        legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims());
-    M *= input.shape.at(legion_dim);
-  }
-  positive_int num_replicas = 1_p;
-  for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) {
-    num_replicas *= input.shape.at(legion_dim_t{i});
-  }
+  auto handle = acc.get_argument<device_handle_t>(HANDLE);
+
+  positive_int M = product(transform(attrs.axes, [&](ff_dim_t dim) {
+    return dim_at_idx(input.shape.dims, dim);
+  }));
+
+  positive_int num_replicas = get_num_elements(input.shape.dims);
+
   positive_int effective_num_elements = M;
   positive_int effective_batch_size =
-      positive_int{input.shape.num_elements() / M};
+      positive_int{get_num_elements(input.shape.dims) / M};

-  LayerNormPerDeviceState per_device_state =
-      init_kernel(handle,
+  std::optional<LayerNormPerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
                   allocator,
                   attrs.elementwise_affine,
                   effective_batch_size.int_from_positive_int(),
                   effective_num_elements.int_from_positive_int(),
                   attrs.eps);
+
   return DeviceSpecificDeviceStates{
-      DeviceSpecific<LayerNormPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<LayerNormPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 TaskImplFunction get_layer_norm_init_task_impl() {
@@ -162,6 +186,7 @@ OpTaskSignature get_layer_norm_fwd_signature() {
   fwd.add_weight_slot(BETA);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   return fwd;
 }
@@ -176,7 +201,8 @@ OpTaskSignature get_layer_norm_init_signature() {
   init.add_input_slot(INPUT);

   init.add_arg_slot<LayerNormAttrs>(ATTRS);
-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);

   init.add_return_value();
   return init;
diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc
index e8be7781f5..9ce02bc7fd 100644
--- a/lib/task-spec/src/task-spec/ops/linear.cc
+++ b/lib/task-spec/src/task-spec/ops/linear.cc
@@ -1,15 +1,14 @@
 #include "task-spec/ops/linear.h"
-#include "kernels/linear_kernels.h"
 #include "kernels/format_accessor_contents.h"
+#include "kernels/linear_kernels.h"
 #include "op-attrs/ff_dim_t.h"
+#include "task-spec/profiling.h"
 #include "task-spec/task_argument_accessor.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"

 namespace FlexFlow {

-using namespace FlexFlow::Kernels::Linear;
-
 enum slots {
   INPUT,
   OUTPUT,
@@ -18,72 +17,87 @@ enum slots {
   ATTRS,
   PROFILING,
   HANDLE,
-  PER_DEVICE_STATE
+  PER_DEVICE_STATE,
+  KERNEL_DEVICE_TYPE,
 };

 OpTaskInvocation init(LinearAttrs const &attrs) {
   OpTaskBinding binding;
   binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(WEIGHT, weight_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(WEIGHT, weight_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::LINEAR_INIT_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::LINEAR_INIT_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation forward(LinearAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(WEIGHT, weight_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(WEIGHT, weight_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

   if (attrs.use_bias) {
-    binding.bind(BIAS, weight_tensor(1));
+    binding.bind(BIAS, weight_tensor(1_n));
   }

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<LinearPerDeviceState>());
+                   per_device_op_state<std::optional<LinearPerDeviceState>>());
   binding.bind_arg(ATTRS, attrs);

-  return {task_id_t::LINEAR_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::LINEAR_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(LinearAttrs const &attrs) {
   OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::LINEAR_BWD_TASK_ID, b};
+  return OpTaskInvocation{
+      task_id_t::LINEAR_BWD_TASK_ID,
+      b,
+  };
 }

 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<LinearAttrs>(ATTRS);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto weight = acc.get_tensor(WEIGHT);
   auto output = acc.get_tensor(OUTPUT);

-  positive_int out_dim = output.shape.at(ff_dim_t{0_n});
-  positive_int batch_size = output.shape.at(ff_dim_t{1_n});
-
-  float *one_ptr;
-
-  LinearPerDeviceState per_device_state =
-      init_kernel(handle,
-                  one_ptr,
-                  attrs.activation,
-                  attrs.regularizer,
-                  attrs.use_bias,
-                  input.data_type,
-                  weight.data_type,
-                  output.data_type,
-                  batch_size.int_from_positive_int(),
-                  attrs.out_channels.int_from_positive_int());
+  positive_int out_dim = dim_at_idx(output.shape.dims, ff_dim_t{0_n});
+  positive_int batch_size = dim_at_idx(output.shape.dims, ff_dim_t{1_n});
+
+  std::optional<LinearPerDeviceState> per_device_state =
+      linear_init_kernel(kernel_device_type,
+                         handle,
+                         attrs.activation,
+                         attrs.regularizer,
+                         attrs.use_bias,
+                         input.shape.data_type,
+                         weight.shape.data_type,
+                         output.shape.data_type,
+                         batch_size.int_from_positive_int(),
+                         attrs.out_channels.int_from_positive_int());
+
   return DeviceSpecificDeviceStates{
-      DeviceSpecific<LinearPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<LinearPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
@@ -92,31 +106,27 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto output = acc.get_tensor(OUTPUT);

   auto per_device_state =
-      acc.get_argument<LinearPerDeviceState>(PER_DEVICE_STATE);
+      acc.get_argument<std::optional<LinearPerDeviceState>>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto attrs = acc.get_argument<LinearAttrs>(ATTRS);

-  positive_int in_dim = input.shape.at(ff_dim_t{0_n});
-  positive_int out_dim = output.shape.at(ff_dim_t{0_n});
-  positive_int batch_size = positive_int{output.shape.num_elements() / out_dim};
-
-  float const *bias_ptr = NULL;
+  std::optional<GenericTensorAccessorR> bias = std::nullopt;
   if (attrs.use_bias) {
-    auto bias = acc.get_tensor(BIAS);
-    bias_ptr = bias.get_float_ptr();
+    bias = acc.get_tensor(BIAS);
   }

-  auto result = profile(forward_kernel,
-                        profiling,
-                        "[Linear] forward_time = {:.2lf}ms\n",
-                        per_device_state,
-                        input.get_float_ptr(),
-                        output.get_float_ptr(),
-                        weight.get_float_ptr(),
-                        bias_ptr,
-                        in_dim.int_from_positive_int(),
-                        out_dim.int_from_positive_int(),
-                        batch_size.int_from_positive_int());
+  auto result = profile(linear_forward_kernel,
+                        profiling,
+                        kernel_device_type,
+                        "[Linear] forward_time = {:.2lf}ms\n",
+                        per_device_state,
+                        attrs,
+                        input,
+                        output,
+                        weight,
+                        bias);

   return result;
 }
@@ -132,34 +142,30 @@ static std::optional<float>
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   auto per_device_state =
-      acc.get_argument<LinearPerDeviceState>(PER_DEVICE_STATE);
+      acc.get_argument<std::optional<LinearPerDeviceState>>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto attrs = acc.get_argument<LinearAttrs>(ATTRS);

-  float *bias_grad_ptr = NULL;
+  std::optional<GenericTensorAccessorW> bias_grad = std::nullopt;
   if (attrs.use_bias) {
-    auto bias_grad = acc.get_tensor_grad(BIAS);
-    bias_grad_ptr = bias_grad.get_float_ptr();
+    bias_grad = acc.get_tensor_grad(BIAS);
   }

-  positive_int in_dim = input.shape.at(ff_dim_t{0_n});
-  positive_int out_dim = output.shape.at(ff_dim_t{0_n});
-  positive_int batch_size = positive_int{output.shape.num_elements() / out_dim};
-
-  auto result = profile(backward_kernel,
-                        profiling,
-                        "[Linear] backward_time = {:.2lf}ms\n",
-                        per_device_state,
-                        output.get_float_ptr(),
-                        output_grad.get_float_ptr(),
-                        input.get_float_ptr(),
-                        input_grad.get_float_ptr(),
-                        weight.get_float_ptr(),
-                        weight_grad.get_float_ptr(),
-                        bias_grad_ptr,
-                        in_dim.int_from_positive_int(),
-                        out_dim.int_from_positive_int(),
-                        batch_size.int_from_positive_int());
+  auto result = profile(linear_backward_kernel,
+                        profiling,
+                        kernel_device_type,
+                        "[Linear] backward_time = {:.2lf}ms\n",
+                        per_device_state,
+                        attrs,
+                        output,
+                        output_grad,
+                        input,
+                        input_grad,
+                        weight,
+                        weight_grad,
+                        bias_grad);

   return result;
 }
@@ -167,9 +173,11 @@ static std::optional<float>
 TaskImplFunction get_linear_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_linear_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_linear_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -182,6 +190,7 @@ OpTaskSignature get_linear_init_signature() {
   init.add_output_slot(OUTPUT);

   init.add_arg_slot<LinearAttrs>(ATTRS);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   init.add_unchecked_arg_slot(HANDLE);

   init.add_return_value();
@@ -197,6 +206,7 @@ OpTaskSignature get_linear_fwd_signature() {
   fwd.add_output_slot(OUTPUT);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_arg_slot<LinearAttrs>(ATTRS);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   return fwd;
diff --git a/lib/task-spec/src/task-spec/ops/pool_2d.cc b/lib/task-spec/src/task-spec/ops/pool_2d.cc
index bceced61d3..20707acb2d 100644
--- a/lib/task-spec/src/task-spec/ops/pool_2d.cc
+++ b/lib/task-spec/src/task-spec/ops/pool_2d.cc
@@ -1,6 +1,7 @@
 #include "task-spec/ops/pool_2d.h"
 #include "kernels/pool_2d_kernels.h"
 #include "op-attrs/ops/pool_2d.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"

@@ -8,16 +9,28 @@ using namespace FlexFlow::Kernels::Pool2D;

 namespace FlexFlow {

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE };
+enum Slots {
+  INPUT,
+  OUTPUT,
+  ATTRS,
+  PROFILING,
+  PER_DEVICE_STATE,
+  HANDLE,
+  KERNEL_DEVICE_TYPE
+};

 OpTaskInvocation init(Pool2DAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

   binding.bind_arg(ATTRS, attrs);
   binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  return {task_id_t::POOL2D_INIT_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::POOL2D_INIT_TASK_ID,
+      binding,
+  };
 }

 static nonnegative_int calculate_padding(nonnegative_int output_size,
@@ -37,22 +50,25 @@ static nonnegative_int calculate_padding(nonnegative_int output_size,
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<Pool2DAttrs>(ATTRS);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  positive_int input_w = input.shape.at(ff_dim_t{0_n});
-  positive_int input_h = input.shape.at(ff_dim_t{1_n});
-  positive_int input_c = input.shape.at(ff_dim_t{2_n});
-  positive_int input_n = input.shape.at(ff_dim_t{3_n});
-  positive_int output_w = output.shape.at(ff_dim_t{0_n});
-  positive_int output_h = output.shape.at(ff_dim_t{1_n});
-  positive_int output_c = output.shape.at(ff_dim_t{2_n});
-  positive_int output_n = output.shape.at(ff_dim_t{3_n});
-
-  Pool2DPerDeviceState per_device_state =
-      init_kernel(handle,
+  positive_int input_w = dim_at_idx(input.shape.dims, ff_dim_t{0_n});
+  positive_int input_h = dim_at_idx(input.shape.dims, ff_dim_t{1_n});
+  positive_int input_c = dim_at_idx(input.shape.dims, ff_dim_t{2_n});
+  positive_int input_n = dim_at_idx(input.shape.dims, ff_dim_t{3_n});
+  positive_int output_w = dim_at_idx(output.shape.dims, ff_dim_t{0_n});
+  positive_int output_h = dim_at_idx(output.shape.dims, ff_dim_t{1_n});
+  positive_int output_c = dim_at_idx(output.shape.dims, ff_dim_t{2_n});
+  positive_int output_n = dim_at_idx(output.shape.dims, ff_dim_t{3_n});
+
+  std::optional<Pool2DPerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
                   attrs.activation,
                   input_w.int_from_positive_int(),
                   input_h.int_from_positive_int(),
@@ -71,29 +87,40 @@ static DeviceSpecificDeviceStates
                   attrs.pool_type);

   return DeviceSpecificDeviceStates{
-      DeviceSpecific<Pool2DPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<Pool2DPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 OpTaskInvocation forward(Pool2DAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<Pool2DPerDeviceState>());
+                   per_device_op_state<std::optional<Pool2DPerDeviceState>>());

-  return {task_id_t::POOL2D_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::POOL2D_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(Pool2DAttrs const &attrs) {
   OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::POOL2D_BWD_TASK_ID, b};
+  return OpTaskInvocation{
+      task_id_t::POOL2D_BWD_TASK_ID,
+      b,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   Pool2DPerDeviceState state =
       acc.get_argument<Pool2DPerDeviceState>(PER_DEVICE_STATE);
@@ -102,6 +129,7 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Pool2D] forward_time = {:.2lf}ms\n",
                  state,
                  input.get_float_ptr(),
@@ -111,6 +139,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   Pool2DPerDeviceState state =
       acc.get_argument<Pool2DPerDeviceState>(PER_DEVICE_STATE);
@@ -121,6 +151,7 @@ static std::optional<float>

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Pool2D] backward_time = {:.2lf}ms\n",
                  state,
                  output.get_float_ptr(),
@@ -132,9 +163,11 @@ static std::optional<float>
 TaskImplFunction get_pool_2d_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_pool_2d_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_pool_2d_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -146,21 +179,25 @@ OpTaskSignature get_pool_2d_init_signature() {
   init.add_output_slot(OUTPUT);

   init.add_arg_slot<Pool2DAttrs>(ATTRS);
-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);

   init.add_return_value();
   return init;
 }
+
 OpTaskSignature get_pool_2d_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   return fwd;
 }
+
 OpTaskSignature get_pool_2d_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_pool_2d_fwd_signature());
   return bwd;
diff --git a/lib/task-spec/src/task-spec/ops/reduce.cc b/lib/task-spec/src/task-spec/ops/reduce.cc
index 3efac36c3f..d8818393ec 100644
--- a/lib/task-spec/src/task-spec/ops/reduce.cc
+++ b/lib/task-spec/src/task-spec/ops/reduce.cc
@@ -1,5 +1,6 @@
 #include "task-spec/ops/reduce.h"
 #include "kernels/reduce_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"
 #include "utils/type_traits_core.h"
@@ -15,24 +16,31 @@ enum Slots {
   PROFILING,
   REDUCE,
   PER_DEVICE_STATE,
-  HANDLE
+  HANDLE,
+  KERNEL_DEVICE_TYPE,
 };

 OpTaskInvocation init(ReduceAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::REDUCE_INIT_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REDUCE_INIT_TASK_ID,
+      binding,
+  };
 }

 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto attrs = acc.get_argument<ReduceAttrs>(ATTRS);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
@@ -40,15 +48,20 @@ static DeviceSpecificDeviceStates
   OperatorType op_type = attrs.op_type;
   nonnegative_int reduction_size =
-      input.shape.num_elements() / output.shape.num_elements();
-  ReducePerDeviceState per_device_state =
-      init_kernel(handle,
+      get_num_elements(input.shape.dims) / get_num_elements(output.shape.dims);
+
+  std::optional<ReducePerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
                   op_type,
                   reduction_size.unwrap_nonnegative(),
                   input.shape,
                   output.shape);
+
   return DeviceSpecificDeviceStates{
-      DeviceSpecific<ReducePerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<ReducePerDeviceState>>::create(
+          per_device_state),
+  };
 }

 // Note: forward_kernel only needs ReducePerDeviceState, input, output
@@ -56,25 +69,32 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<ReducePerDeviceState>());
+                   per_device_op_state<std::optional<ReducePerDeviceState>>());
   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::REDUCE_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REDUCE_FWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto per_device_state =
       acc.get_argument(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reduce] forward_time = {:.2lf}ms\n",
                  per_device_state,
                  input.get_float_ptr(),
@@ -84,7 +104,10 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 OpTaskInvocation backward(ReduceAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::REDUCE_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REDUCE_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float>
@@ -92,12 +115,15 @@ static std::optional<float>
   auto per_device_state =
       acc.get_argument(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reduce] backward_time = {:.2lf}ms\n",
                  per_device_state,
                  output_grad.get_float_ptr(),
@@ -107,9 +133,11 @@ static std::optional<float>
 TaskImplFunction get_reduce_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_reduce_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_reduce_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -117,22 +145,26 @@ TaskImplFunction get_reduce_bwd_task_impl() {
 OpTaskSignature get_reduce_init_signature() {
   OpTaskSignature init(OpTaskType::INIT);

-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);
   init.add_arg_slot<ReduceAttrs>(ATTRS);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   init.add_return_value();

   return init;
 }
+
 OpTaskSignature get_reduce_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
 }
+
 OpTaskSignature get_reduce_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_reduce_fwd_signature());
   return bwd;
diff --git a/lib/task-spec/src/task-spec/ops/reduction.cc b/lib/task-spec/src/task-spec/ops/reduction.cc
deleted file mode 100644
index 48f4c0e98d..0000000000
--- a/lib/task-spec/src/task-spec/ops/reduction.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "task-spec/ops/reduction.h"
-#include "kernels/reduction_kernels.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Reduction;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(ReductionAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(ATTRS, attrs);
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::REDUCTION_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(ReductionAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::REDUCTION_BWD_TASK_ID, binding};
-}
-
-static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling_settings =
-      acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-  auto attrs = acc.get_argument<ReductionAttrs>(ATTRS);
-
-  positive_int num_replicas = attrs.reduction_degree;
-
-  return profile(forward_kernel,
-                 profiling_settings,
-                 "[Reduction] forward_time = {:.2lf}ms\n",
-                 input,
-                 output,
-                 num_replicas.int_from_positive_int());
-}
-
-static std::optional<float>
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  return profile(backward_kernel,
-                 profiling,
-                 "[Reduction] backward_time = {:.2lf}ms\n",
-                 output_grad,
-                 input_grad);
-}
-
-TaskImplFunction get_reduction_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_reduction_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_reduction_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_arg_slot<ReductionAttrs>(ATTRS);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-  return fwd;
-}
-OpTaskSignature get_reduction_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_reduction_fwd_signature());
-  return bwd;
-}
-
-std::vector<task_id_t> get_task_ids(ReductionAttrs const &) {
-  return {task_id_t::REDUCTION_FWD_TASK_ID, task_id_t::REDUCTION_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/repartition.cc b/lib/task-spec/src/task-spec/ops/repartition.cc
deleted file mode 100644
index cfc45dede7..0000000000
--- a/lib/task-spec/src/task-spec/ops/repartition.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "task-spec/ops/repartition.h"
-#include "kernels/partition_kernels.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Repartition;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, HANDLE, PER_DEVICE_STATE };
-
-OpTaskInvocation init(RepartitionAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(HANDLE, ff_handle());
-  binding.bind(INPUT, input_tensor(0));
-
-  return {task_id_t::REPARTITION_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(RepartitionAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(ATTRS, attrs);
-  binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<RepartitionPerDeviceState>());
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::REPARTITION_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(RepartitionAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::REPARTITION_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  auto input = acc.get_tensor(INPUT);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-
-  // Note: use the input data type
-
-  RepartitionPerDeviceState per_device_state =
-      init_kernel(handle, input.data_type);
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<RepartitionPerDeviceState>::create(per_device_state)};
-}
-
-static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-  auto per_device_state =
-      acc.get_argument<RepartitionPerDeviceState>(PER_DEVICE_STATE);
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[Reparition/Partition] forward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 input,
-                 output);
-}
-
-static std::optional<float>
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-  auto per_device_state =
-      acc.get_argument<RepartitionPerDeviceState>(PER_DEVICE_STATE);
-  auto output_grad = acc.get_tensor_grad(INPUT);
-  auto input_grad = acc.get_tensor_grad(OUTPUT);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[Reparition/Partition] backward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 output_grad,
-                 input_grad);
-}
-
-TaskImplFunction get_repartition_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_repartition_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_repartition_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_repartition_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
-  init.add_input_slot(INPUT);
-  init.add_return_value();
-  return init;
-}
-OpTaskSignature get_repartition_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_unchecked_arg_slot<RepartitionPerDeviceState>(PER_DEVICE_STATE);
-  return fwd;
-}
-OpTaskSignature get_repartition_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_repartition_fwd_signature());
-  return bwd;
-}
-
-std::vector<task_id_t> get_task_ids(RepartitionAttrs const &) {
-  return {task_id_t::REPARTITION_INIT_TASK_ID,
-          task_id_t::REPARTITION_FWD_TASK_ID,
-          task_id_t::REPARTITION_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/replicate.cc b/lib/task-spec/src/task-spec/ops/replicate.cc
deleted file mode 100644
index e91414bc16..0000000000
--- a/lib/task-spec/src/task-spec/ops/replicate.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "task-spec/ops/replicate.h"
-#include "kernels/replicate_kernels.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Replicate;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(ReplicateAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-  binding.bind_arg(ATTRS, attrs);
-
-  return {task_id_t::REPLICATE_FWD_TASK_ID, binding};
-}
-OpTaskInvocation backward(ReplicateAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::REPLICATE_BWD_TASK_ID, binding};
-}
-
-static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[replicate] forward_time = {:.2lf}ms\n",
-                 input,
-                 output);
-}
-
-static std::optional<float>
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-  auto attrs = acc.get_argument<ReplicateAttrs>(ATTRS);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[replicate] backward_time = {:.2lf}ms\n",
-                 output_grad,
-                 input_grad,
-                 attrs.replicate_degree.int_from_positive_int());
-}
-
-TaskImplFunction get_replicate_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_replicate_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_replicate_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-  return fwd;
-}
-
-OpTaskSignature get_replicate_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_replicate_fwd_signature());
-  return bwd;
-}
-
-std::vector<task_id_t> get_task_ids(ReplicateAttrs const &) {
-  return {task_id_t::REPLICATE_FWD_TASK_ID, task_id_t::REPLICATE_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/reshape.cc b/lib/task-spec/src/task-spec/ops/reshape.cc
index 0b43f3e31f..b6d8cabd82 100644
--- a/lib/task-spec/src/task-spec/ops/reshape.cc
+++ b/lib/task-spec/src/task-spec/ops/reshape.cc
@@ -15,118 +15,98 @@

 #include "task-spec/ops/reshape.h"
 #include "kernels/reshape_kernels.h"
+#include "task-spec/profiling.h"

 namespace FlexFlow {

 using namespace FlexFlow::Kernels::Reshape;

-enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE };
-
-OpTaskInvocation init(ReshapeAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(ATTRS, attrs);
-
-  return {task_id_t::RESHAPE_INIT_TASK_ID, binding};
-}
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(ReshapeAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<ReshapePerDeviceState>());
   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
+  binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-  return {task_id_t::RESHAPE_FWD_TASK_ID, binding};
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));
+  return OpTaskInvocation{
+      task_id_t::RESHAPE_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(ReshapeAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::RESHAPE_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  auto attrs = acc.get_argument<ReshapeAttrs>(ATTRS);
-
-  ReshapePerDeviceState per_device_state = init_kernel(attrs.shape.data_type);
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<ReshapePerDeviceState>::create(per_device_state)};
+  return OpTaskInvocation{
+      task_id_t::RESHAPE_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument<ReshapePerDeviceState>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
+  ReshapeAttrs attrs = acc.get_argument<ReshapeAttrs>(ATTRS);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reshape] forward time = {:.2lf}ms\n",
-                 per_device_state,
                  input,
                  output);
 }

 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument<ReshapePerDeviceState>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
+  ReshapeAttrs attrs = acc.get_argument<ReshapeAttrs>(ATTRS);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reshape] backward time = {:.2lf}ms\n",
-                 per_device_state,
                  output_grad,
                  input_grad);
 }

-TaskImplFunction get_reshape_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
 TaskImplFunction get_reshape_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_reshape_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }

-OpTaskSignature get_reshape_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_arg_slot<ReshapeAttrs>(ATTRS);
-
-  init.add_return_value();
-  return init;
-}
 OpTaskSignature get_reshape_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
 }
+
 OpTaskSignature get_reshape_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_reshape_fwd_signature());
   return bwd;
 }

 std::vector<task_id_t> get_task_ids(ReshapeAttrs const &) {
-  return {task_id_t::RESHAPE_INIT_TASK_ID,
-          task_id_t::RESHAPE_FWD_TASK_ID,
-          task_id_t::RESHAPE_BWD_TASK_ID};
+  return {task_id_t::RESHAPE_FWD_TASK_ID, task_id_t::RESHAPE_BWD_TASK_ID};
 }

 }; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/reverse.cc b/lib/task-spec/src/task-spec/ops/reverse.cc
index 41739d086e..9d1a8e1753 100644
--- a/lib/task-spec/src/task-spec/ops/reverse.cc
+++ b/lib/task-spec/src/task-spec/ops/reverse.cc
@@ -16,6 +16,7 @@
 #include "task-spec/ops/reverse.h"
 #include "kernels/accessor.h"
 #include "kernels/reverse_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/nonnegative_int/nonnegative_range.h"

 namespace FlexFlow {
@@ -23,33 +24,43 @@ namespace FlexFlow {
 using namespace FlexFlow::Kernels::Reverse;
 using coord_t = long long;

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(ReverseAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::REVERSE_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REVERSE_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(ReverseAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::REVERSE_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REVERSE_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument<ReverseAttrs>(ATTRS);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[reverse] forward_time = {:.2lf}ms\n",
                  input,
                  output,
@@ -59,12 +70,15 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument<ReverseAttrs>(ATTRS);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[reverse] backward_time = {:.2lf}ms\n",
                  output_grad,
                  input_grad,
@@ -82,6 +96,7 @@ OpTaskSignature get_reverse_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
diff --git a/lib/task-spec/src/task-spec/ops/softmax.cc b/lib/task-spec/src/task-spec/ops/softmax.cc
index 81239d1a67..89ea42299f 100644
--- a/lib/task-spec/src/task-spec/ops/softmax.cc
+++ b/lib/task-spec/src/task-spec/ops/softmax.cc
@@ -16,75 +16,104 @@
 #include "task-spec/ops/softmax.h"
 #include "kernels/softmax_kernels.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"

 namespace FlexFlow {

 using namespace FlexFlow::Kernels::Softmax;

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE };
+enum Slots {
+  INPUT,
+  OUTPUT,
+  ATTRS,
+  PROFILING,
+  PER_DEVICE_STATE,
+  HANDLE,
+  KERNEL_DEVICE_TYPE
+};

 OpTaskInvocation init(SoftmaxAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(HANDLE, ff_handle());
   binding.bind_arg(ATTRS, attrs);

-  return {task_id_t::SOFTMAX_INIT_TASK_ID, binding};
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
+
+  return OpTaskInvocation{
+      task_id_t::SOFTMAX_INIT_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation forward(SoftmaxAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<SoftmaxPerDeviceState>());
+                   per_device_op_state<std::optional<SoftmaxPerDeviceState>>());
   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::SOFTMAX_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SOFTMAX_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(SoftmaxAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::SOFTMAX_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SOFTMAX_BWD_TASK_ID,
+      binding,
+  };
 }

 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto output = acc.get_tensor(OUTPUT);
   auto const &attrs = acc.get_argument<SoftmaxAttrs>(ATTRS);

-  positive_int output_w = output.shape.at(legion_dim_t{0_n});
-  positive_int output_h = output.shape.at(legion_dim_t{1_n});
-  positive_int output_c = output.shape.at(legion_dim_t{2_n});
-  positive_int output_n = output.shape.at(legion_dim_t{3_n});
+  positive_int output_w = dim_at_idx(output.shape.dims, legion_dim_t{0_n});
+  positive_int output_h = dim_at_idx(output.shape.dims, legion_dim_t{1_n});
+  positive_int output_c = dim_at_idx(output.shape.dims, legion_dim_t{2_n});
+  positive_int output_n = dim_at_idx(output.shape.dims, legion_dim_t{3_n});

-  SoftmaxPerDeviceState per_device_state =
-      init_kernel(handle,
-                  attrs.dim.value.unwrap_nonnegative(),
+  std::optional<SoftmaxPerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
+                  attrs.dim,
                   output_n.int_from_positive_int(),
                   output_c.int_from_positive_int(),
                   output_h.int_from_positive_int(),
                   output_w.int_from_positive_int());

   return DeviceSpecificDeviceStates{
-      DeviceSpecific<SoftmaxPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<SoftmaxPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto per_device_state =
       acc.get_argument(PER_DEVICE_STATE);

   return profile(forward_kernel,
                  profiling,
-                 "[SoftMax] forward_time = {:.2lf}ms\n",
+                 kernel_device_type,
+                 "[Softmax] forward_time = {:.2lf}ms\n",
                  per_device_state,
                  input.get_float_ptr(),
                  output.get_float_ptr());
@@ -93,6 +122,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto input = acc.get_tensor(INPUT);
@@ -103,20 +134,24 @@ static std::optional<float>

   assert(output_grad.shape == output.shape);

-  return profile(backward_kernel,
-                 profiling,
-                 "[SoftMax] backward_time = {:.2lf}ms\n",
-                 output_grad.get_float_ptr(),
-                 input_grad.get_float_ptr(),
-                 output_grad.shape.num_elements().int_from_positive_int());
+  return profile(
+      backward_kernel,
+      profiling,
+      kernel_device_type,
+      "[Softmax] backward_time = {:.2lf}ms\n",
+      output_grad.get_float_ptr(),
+      input_grad.get_float_ptr(),
+      get_num_elements(output_grad.shape.dims).int_from_positive_int());
 }

 TaskImplFunction get_softmax_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_softmax_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_softmax_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -124,21 +159,25 @@ TaskImplFunction get_softmax_bwd_task_impl() {
 OpTaskSignature get_softmax_init_signature() {
   OpTaskSignature init(OpTaskType::INIT);

-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);
   init.add_arg_slot<SoftmaxAttrs>(ATTRS);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   init.add_return_value();

   return init;
 }
+
 OpTaskSignature get_softmax_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
 }
+
 OpTaskSignature get_softmax_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_softmax_fwd_signature());
   return bwd;
diff --git a/lib/task-spec/src/task-spec/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc
index 145a9b58a3..88c16be57c 100644
--- a/lib/task-spec/src/task-spec/ops/split.cc
+++ b/lib/task-spec/src/task-spec/ops/split.cc
@@ -14,8 +14,8 @@
  */

 #include "task-spec/ops/split.h"
-#include "kernels/array_shape.h"
 #include "kernels/split_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"
 #include "utils/nonnegative_int/nonnegative_range.h"
@@ -23,37 +23,45 @@
 namespace FlexFlow {

 using namespace FlexFlow::Kernels::Split;
-using coord_t = long long;

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(SplitAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PROFILING, profiling_settings());
   binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::SPLIT_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SPLIT_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(SplitAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::SPLIT_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SPLIT_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::pair<positive_int, positive_int>
-    calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) {
+    calc_block_size(TensorShape const &tensor_shape, ff_dim_t axis) {
   positive_int num_blocks = 1_p;
   positive_int block_size = 1_p;
-  for (nonnegative_int d : nonnegative_range(
-           array_shape.num_elements().nonnegative_int_from_positive_int())) {
+  for (nonnegative_int d :
+       nonnegative_range(get_num_elements(tensor_shape.dims)
+                             .nonnegative_int_from_positive_int())) {
     if (d <= axis.value) {
-      block_size *= array_shape.at(legion_dim_t{d});
+      block_size *= dim_at_idx(tensor_shape.dims, legion_dim_t{d});
     } else {
-      num_blocks *= array_shape.at(legion_dim_t{d});
+      num_blocks *= dim_at_idx(tensor_shape.dims, legion_dim_t{d});
     }
   }
   return {num_blocks, block_size};
@@ -61,11 +69,13 @@ static std::pair<positive_int, positive_int>

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument<SplitAttrs>(ATTRS);

-  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  int out_block_sizes[MAX_NUM_OUTPUTS];
   auto [num_blocks, in_block_size] = calc_block_size(input.shape, attrs.axis);

   for (int i = 0; i < attrs.splits.size(); i++) {
@@ -75,7 +85,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   float *output_float_ptr = output.get_float_ptr();
   return profile(forward_kernel,
                  profiling,
-                 "Split forward_time = {:.2lf}ms\n",
+                 kernel_device_type,
+                 "[Split] forward_time = {:.2lf}ms\n",
                  &output_float_ptr,
                  input.get_float_ptr(),
                  out_block_sizes,
@@ -88,23 +99,26 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument<SplitAttrs>(ATTRS);

-  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  int out_block_sizes[MAX_NUM_OUTPUTS];
   auto [num_blocks, in_block_size] =
       calc_block_size(input_grad.shape, attrs.axis);

   for (int i = 0; i < attrs.splits.size(); i++) {
-    coord_t out_num_blocks;
+    int out_num_blocks;
     auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis);
     out_block_sizes[i] = out_block_size.int_from_positive_int();
   }
   float const *output_grad_ptr = output_grad.get_float_ptr();
   return profile(backward_kernel,
                  profiling,
-                 "Split backward_time = {:.2lf}ms\n",
+                 kernel_device_type,
+                 "[Split] backward_time = {:.2lf}ms\n",
                  input_grad.get_float_ptr(),
                  &output_grad_ptr,
                  out_block_sizes,
diff --git a/lib/task-spec/src/task-spec/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc
index bdf92d8487..8ff275dac3 100644
--- a/lib/task-spec/src/task-spec/ops/topk.cc
+++ b/lib/task-spec/src/task-spec/ops/topk.cc
@@ -15,6 +15,7 @@

 #include "task-spec/ops/topk.h"
 #include "kernels/topk_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"

 namespace FlexFlow {
@@ -25,63 +26,52 @@ using namespace FlexFlow::Kernels::TopK;
 // (resp. vector along the last dimension). Thus,
 // values.shape = indices.shape = input.shape[:-1] + [k]

-enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE };
-
-OpTaskInvocation init(TopKAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(ATTRS, attrs);
-
-  return {task_id_t::TOPK_INIT_TASK_ID, binding};
-}
+enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(TopKAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind_arg(PER_DEVICE_STATE, per_device_op_state<TopKPerDeviceState>());
   binding.bind_arg(PROFILING, profiling_settings());
   binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-  binding.bind(INDICES, output_tensor(1));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));
+  binding.bind(INDICES, output_tensor(1_n));

-  return {task_id_t::TOPK_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::TOPK_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(TopKAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::TOPK_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-
-  auto attrs = acc.get_argument<TopKAttrs>(ATTRS);
-
-  TopKPerDeviceState per_device_state = init_kernel(attrs.sorted);
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<TopKPerDeviceState>::create(per_device_state)};
+  return OpTaskInvocation{
+      task_id_t::TOPK_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument<TopKAttrs>(ATTRS);
-  auto per_device_state =
-      acc.get_argument<TopKPerDeviceState>(PER_DEVICE_STATE);
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  positive_int length = input.shape.at(legion_dim_t{0_n});
-  positive_int batch_size = positive_int{input.shape.num_elements() / length};
+  positive_int length = dim_at_idx(input.shape.dims, legion_dim_t{0_n});
+  positive_int batch_size =
+      positive_int{get_num_elements(input.shape.dims) / length};
   auto indices = acc.get_tensor(INDICES);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[TopK] forward_time = {:.2lf}ms\n",
-                 per_device_state,
                  input.get_float_ptr(),
                  output.get_float_ptr(),
                  indices.get_int32_ptr(),
@@ -94,23 +84,23 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument<TopKAttrs>(ATTRS);
-  auto per_device_state =
-      acc.get_argument<TopKPerDeviceState>(PER_DEVICE_STATE);
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto indices = acc.get_tensor(INDICES);

-  positive_int length = input_grad.shape.at(legion_dim_t{0_n});
+  positive_int length = dim_at_idx(input_grad.shape.dims, legion_dim_t{0_n});
   positive_int batch_size =
-      positive_int{input_grad.shape.num_elements() / length};
+      positive_int{get_num_elements(input_grad.shape.dims) / length};

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[TopK] backward_time = {:.2lf}ms\n",
-                 per_device_state,
                  output_grad.get_float_ptr(),
                  indices.get_int32_ptr(),
                  input_grad.get_float_ptr(),
@@ -119,45 +109,34 @@ static std::optional<float>
                  attrs.k.int_from_positive_int());
 }

-TaskImplFunction get_topk_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
 TaskImplFunction get_topk_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_topk_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }

-OpTaskSignature get_topk_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_arg_slot<TopKAttrs>(ATTRS);
-  init.add_return_value();
-
-  return init;
-}
 OpTaskSignature get_topk_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
   fwd.add_arg_slot<TopKAttrs>(ATTRS);
-  fwd.add_unchecked_arg_slot<TopKPerDeviceState>(PER_DEVICE_STATE);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   fwd.add_output_slot(INDICES);
   return fwd;
 }
+
 OpTaskSignature get_topk_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_topk_fwd_signature());
   return bwd;
 }

 std::vector<task_id_t> get_task_ids(TopKAttrs const &) {
-  return {task_id_t::TOPK_INIT_TASK_ID,
-          task_id_t::TOPK_FWD_TASK_ID,
-          task_id_t::TOPK_BWD_TASK_ID};
+  return {task_id_t::TOPK_FWD_TASK_ID, task_id_t::TOPK_BWD_TASK_ID};
 }

 }; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/transpose.cc b/lib/task-spec/src/task-spec/ops/transpose.cc
index b6a69b0ed7..b2f94b6484 100644
--- a/lib/task-spec/src/task-spec/ops/transpose.cc
+++ b/lib/task-spec/src/task-spec/ops/transpose.cc
@@ -16,6 +16,7 @@
 #include "task-spec/ops/transpose.h"
 #include "kernels/transpose_kernels.h"
 #include "op-attrs/ops/transpose.h"
+#include "task-spec/profiling.h"
 #include "utils/integer_conversions.h"

 using namespace FlexFlow::Kernels::Transpose;
@@ -23,32 +24,40 @@ using namespace FlexFlow::Kernels::Transpose;
 namespace FlexFlow {

 enum Slots {
-  INPUT,  // tensor
-  OUTPUT, // tensor
+  INPUT,
+  OUTPUT,
   ATTRS,
   PROFILING,
+  KERNEL_DEVICE_TYPE,
 };

 OpTaskInvocation forward(TransposeAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::TRANSPOSE_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::TRANSPOSE_FWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
   auto attrs = acc.get_argument<TransposeAttrs>(ATTRS);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Transpose] Forward_time = {:.2lf} [ms]",
                  attrs,
                  input,
@@ -59,12 +68,15 @@ static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
   auto attrs = acc.get_argument<TransposeAttrs>(ATTRS);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Transpose] Backward_time = {:.2lf} [ms]",
                  attrs,
                  output_grad,
@@ -74,7 +86,10 @@ static std::optional<float>
 OpTaskInvocation backward(TransposeAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::TRANSPOSE_BWD_TASK_ID,
+      binding,
+  };
 }

 TaskImplFunction get_transpose_fwd_task_impl() {
@@ -89,6 +104,7 @@ OpTaskSignature get_transpose_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
diff --git a/lib/local-execution/src/optimizer.cc b/lib/task-spec/src/task-spec/optimizer.cc
similarity index 76%
rename from lib/local-execution/src/optimizer.cc
rename to lib/task-spec/src/task-spec/optimizer.cc
index 1d65172e67..c8fa23c2af 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/task-spec/src/task-spec/optimizer.cc
@@ -1,4 +1,4 @@
-#include "local-execution/optimizer.h"
+#include "task-spec/optimizer.h"
 #include "kernels/optimizer_kernels.h"
 #include "task-spec/profiling.h"
 #include "utils/containers/get_only.h"
@@ -14,7 +14,8 @@ enum Slots {
   PROFILING,
   ADAM_M,
   ADAM_V,
-  HANDLE
+  HANDLE,
+  KERNEL_DEVICE_TYPE,
 };

 TaskSignature get_sgd_update_signature() {
@@ -25,6 +26,7 @@ TaskSignature get_sgd_update_signature() {

   add_arg_slot<SGDOptimizerAttrs>(sig, ATTRS);
   add_arg_slot<ProfilingSettings>(sig, PROFILING);
+  add_arg_slot<DeviceType>(sig, KERNEL_DEVICE_TYPE);

   add_unchecked_arg_slot(
       sig, HANDLE); // how to deal with removal of ParamSync?
@@ -35,9 +37,9 @@ TaskSignature get_sgd_update_signature() {
 }

 TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
-                          tensor_guid_t const &weight,
-                          gradient_tensor_t const &weight_grad,
-                          optimizer_tensor_t const &sgd_v) {
+                          forward_tensor_guid_t const &weight,
+                          gradient_tensor_guid_t const &weight_grad,
+                          optimizer_tensor_guid_t const &sgd_v) {
   TaskBinding b;
   b.bind(WEIGHT, weight);
   b.bind_grad(WEIGHT_GRAD, weight_grad);
@@ -47,6 +49,7 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   }
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
+  b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   b.bind_arg(HANDLE, ff_handle());

   return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID,
@@ -65,35 +68,38 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
   auto weight_grad = acc.get_tensor_grad(WEIGHT_GRAD);
   auto weight = acc.get_tensor(WEIGHT);
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   ASSERT(weight.shape == weight_grad.shape);

-  int size = weight_grad.shape.num_elements().int_from_positive_int();
-  ASSERT(weight_grad.shape.num_elements().int_from_positive_int() &
-         weight.shape.num_elements().int_from_positive_int());
-  int num_replicas = weight_grad.shape.num_elements().int_from_positive_int() /
-                     weight.shape.num_elements().int_from_positive_int();
+  ASSERT(get_num_elements(weight_grad.shape.dims).int_from_positive_int() %
+             get_num_elements(weight.shape.dims).int_from_positive_int() ==
+         0);
+  int num_replicas =
+      get_num_elements(weight_grad.shape.dims).int_from_positive_int() /
+      get_num_elements(weight.shape.dims).int_from_positive_int();

-  float *sgd_v_ptr;
+  std::optional<GenericTensorAccessorW> sgd_v = std::nullopt;
   if (attrs.momentum > 0.0f) {
-    auto sgd_v = acc.get_optimizer_tensor(SGD_V);
-    ASSERT(sgd_v.shape == weight.shape);
-    sgd_v_ptr = sgd_v.get_float_ptr();
+    sgd_v = acc.get_optimizer_tensor(SGD_V);
+    ASSERT(sgd_v.value().shape == weight.shape);
   }

-  auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-  profile(sgd_nccl_update_task_gpu,
+  auto handle = acc.get_argument<device_handle_t>(HANDLE);
+  profile(sgd_update_task,
           profiling,
-          "[SGD NCCL] update_time = %.2lfms\n",
+          kernel_device_type,
"[SGD] update_time = %.2lfms\n", + handle, attrs.lr, attrs.momentum, attrs.nesterov, attrs.weight_decay, - handle, - weight_grad.get_float_ptr(), - size, - weight.get_float_ptr(), - sgd_v_ptr); // how to deal with removal of ParamSync? + weight_grad, + num_replicas, + weight, + sgd_v); // how to deal with removal of ParamSync? // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { // auto handle = acc.get_argument(HANDLE); @@ -139,6 +145,7 @@ TaskSignature get_adam_update_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); + add_arg_slot(sig, KERNEL_DEVICE_TYPE); add_unchecked_arg_slot( sig, HANDLE); // how to deal with removal of ParamSync? // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { @@ -148,10 +155,10 @@ TaskSignature get_adam_update_signature() { } TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - optimizer_tensor_t const &adam_v, - optimizer_tensor_t const &adam_m) { + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + optimizer_tensor_guid_t const &adam_v, + optimizer_tensor_guid_t const &adam_m) { TaskBinding b; b.bind(WEIGHT, weight); b.bind_grad(WEIGHT_GRAD, weight_grad); @@ -159,6 +166,7 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, b.bind_optimizer(ADAM_V, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(HANDLE, ff_handle()); return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b}; // how to deal with removal of ParamSync? @@ -179,24 +187,33 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto m_tensor = acc.get_optimizer_tensor(ADAM_M); auto profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); ASSERT(weight.shape == weight_grad.shape); - int size = weight_grad.shape.num_elements().int_from_positive_int(); + int size = get_num_elements(weight_grad.shape.dims).int_from_positive_int(); - ASSERT(weight_grad.shape.num_elements() % weight.shape.num_elements() == 0); + ASSERT(get_num_elements(weight_grad.shape.dims).int_from_positive_int() % + get_num_elements(weight.shape.dims).int_from_positive_int() == + 0); + int num_replicas = + get_num_elements(weight_grad.shape.dims).int_from_positive_int() / + get_num_elements(weight.shape.dims).int_from_positive_int(); - auto handle = acc.get_argument(HANDLE); - profile(adam_nccl_update_task_gpu, + auto handle = acc.get_argument(HANDLE); + profile(adam_update_task, profiling, + kernel_device_type, "[Adam NCCL] update_time = %.2lfms\n", + handle, attrs.alpha_t, attrs.beta1, attrs.beta2, attrs.weight_decay, attrs.epsilon, - handle, weight_grad.get_float_ptr(), size, + num_replicas, m_tensor.get_float_ptr(), v_tensor.get_float_ptr(), weight.get_float_ptr()); // how to deal with removal of ParamSync? 
@@ -247,9 +264,9 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - std::vector const &grad_buffer_tensors) { + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + std::vector const &grad_buffer_tensors) { return attrs.visit( overload{[&](SGDOptimizerAttrs const &s) { return sgd_update( diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/task-spec/src/task-spec/optimizer_tensor_source.cc similarity index 55% rename from lib/local-execution/src/optimizer_tensor_source.cc rename to lib/task-spec/src/task-spec/optimizer_tensor_source.cc index a1a9a2927d..ad7bf9f489 100644 --- a/lib/local-execution/src/optimizer_tensor_source.cc +++ b/lib/task-spec/src/task-spec/optimizer_tensor_source.cc @@ -1,13 +1,13 @@ -#include "local-execution/optimizer_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" namespace FlexFlow { -size_t OptimizerTensorSource::next_available_optimizer_tensor_id = 0; +int OptimizerTensorSource::next_available_optimizer_tensor_id = 0; OptimizerTensorSource::OptimizerTensorSource() {} -optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() { - return optimizer_tensor_t{ +optimizer_tensor_guid_t OptimizerTensorSource::new_optimizer_tensor() { + return optimizer_tensor_guid_t{ OptimizerTensorSource::next_available_optimizer_tensor_id++}; } diff --git a/lib/task-spec/src/task-spec/profiling.cc b/lib/task-spec/src/task-spec/profiling.cc new file mode 100644 index 0000000000..e32a2e564c --- /dev/null +++ b/lib/task-spec/src/task-spec/profiling.cc @@ -0,0 +1 @@ +#include "task-spec/profiling.h" diff --git a/lib/task-spec/src/task-spec/runtime_arg_config.cc b/lib/task-spec/src/task-spec/runtime_arg_config.cc new file mode 100644 index 0000000000..9f3dc61545 --- /dev/null +++ b/lib/task-spec/src/task-spec/runtime_arg_config.cc @@ -0,0 +1,30 @@ +#include "task-spec/runtime_arg_config.h" +#include "kernels/device_handle_t.h" + +namespace FlexFlow { + +RuntimeArgConfig + cpu_make_runtime_arg_config(EnableProfiling enable_profiling, + ProfilingSettings profiling_settings) { + return RuntimeArgConfig{ + DeviceSpecific::create(cpu_make_device_handle_t()), + enable_profiling, + profiling_settings, + DeviceType::CPU, + }; +} + +RuntimeArgConfig + gpu_make_runtime_arg_config(PerDeviceFFHandle const &ff_handle, + EnableProfiling enable_profiling, + ProfilingSettings profiling_settings) { + return RuntimeArgConfig{ + DeviceSpecific::create( + gpu_make_device_handle_t(ff_handle)), + enable_profiling, + profiling_settings, + DeviceType::GPU, + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/runtime_arg_ref.cc b/lib/task-spec/src/task-spec/runtime_arg_ref.cc index bb4625c113..3aa1b7f907 100644 --- a/lib/task-spec/src/task-spec/runtime_arg_ref.cc +++ b/lib/task-spec/src/task-spec/runtime_arg_ref.cc @@ -1,26 +1,14 @@ #include "task-spec/runtime_arg_ref.h" +#include "kernels/device_handle_t.dtg.h" #include "task-spec/device_specific.h" namespace FlexFlow { -std::string to_string(RuntimeArgRefType const &runtime_arg_ref_type) { - switch (runtime_arg_ref_type) { - case RuntimeArgRefType::FF_HANDLE: - return "FF_HANDLE"; - case RuntimeArgRefType::PROFILING_SETTINGS: - return "PROFILING_SETTINGS"; - case RuntimeArgRefType::FF_ITERATION_CONFIG: - return "FF_ITERATION_CONFIG"; - default: - return "Unknown"; - } -} - RuntimeArgRef profiling_settings() { 
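  // (Gloss: a RuntimeArgRef carries only its RuntimeArgRefType tag; the
  // concrete ProfilingSettings / device handle / DeviceType value appears
  // to be substituted from the RuntimeArgConfig when the bound task runs,
  // which is why bind_arg can accept these placeholders.)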
return {RuntimeArgRefType::PROFILING_SETTINGS}; } -RuntimeArgRef> ff_handle() { +RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } @@ -28,4 +16,8 @@ RuntimeArgRef iteration_config() { return {RuntimeArgRefType::FF_ITERATION_CONFIG}; } +RuntimeArgRef kernel_device_type() { + return {RuntimeArgRefType::KERNEL_DEVICE_TYPE}; +} + } // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/task_invocation.cc b/lib/task-spec/src/task-spec/task_invocation.cc index e182231bda..0677ff6e60 100644 --- a/lib/task-spec/src/task-spec/task_invocation.cc +++ b/lib/task-spec/src/task-spec/task_invocation.cc @@ -7,7 +7,6 @@ namespace FlexFlow { bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { TaskBinding binding = inv.binding; - // args for (std::pair const &arg_binding : binding.get_arg_bindings()) { if (sig.task_arg_types.count(arg_binding.first)) { @@ -20,9 +19,8 @@ bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { } } - // tensors - for (std::pair const &tensor_binding : - binding.get_tensor_bindings()) { + for (std::pair const + &tensor_binding : binding.get_tensor_bindings()) { slot_id_t tensor_slot_id = tensor_binding.first.slot_id; if (sig.tensor_guid_slots.count(tensor_slot_id)) { if (tensor_binding.first.tensor_type == diff --git a/lib/task-spec/src/task-spec/task_signature_impl.cc b/lib/task-spec/src/task-spec/task_signature_impl.cc index 7995c0af0b..8da38b5840 100644 --- a/lib/task-spec/src/task-spec/task_signature_impl.cc +++ b/lib/task-spec/src/task-spec/task_signature_impl.cc @@ -3,7 +3,6 @@ #include "task-spec/ops/batch_matmul.h" #include "task-spec/ops/batch_norm.h" #include "task-spec/ops/cast.h" -#include "task-spec/ops/combine.h" #include "task-spec/ops/concat.h" #include "task-spec/ops/conv_2d.h" #include "task-spec/ops/dropout.h" @@ -18,9 +17,6 @@ #include "task-spec/ops/noop.h" #include "task-spec/ops/pool_2d.h" #include "task-spec/ops/reduce.h" -#include "task-spec/ops/reduction.h" -#include "task-spec/ops/repartition.h" -#include "task-spec/ops/replicate.h" #include "task-spec/ops/reshape.h" #include "task-spec/ops/reverse.h" #include "task-spec/ops/softmax.h" @@ -32,7 +28,8 @@ namespace FlexFlow { -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { +TaskSignatureAndImpl + get_task_signature_and_impl_for_task_id(task_id_t const &task_id) { switch (task_id) { case task_id_t::ELEMENTBINARY_INIT_TASK_ID: return TaskSignatureAndImpl{get_element_binary_init_task_impl(), @@ -70,12 +67,12 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::DROPOUT_BWD_TASK_ID: return TaskSignatureAndImpl{get_dropout_bwd_task_impl(), get_dropout_bwd_signature()}; - // case task_id_t::EMBED_FWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_fwd_task_impl(), - // get_embedding_fwd_signature()}; - // case task_id_t::EMBED_BWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_bwd_task_impl(), - // get_embedding_bwd_signature()}; + case task_id_t::EMBED_FWD_TASK_ID: + return TaskSignatureAndImpl{get_embedding_fwd_task_impl(), + get_embedding_fwd_signature()}; + case task_id_t::EMBED_BWD_TASK_ID: + return TaskSignatureAndImpl{get_embedding_bwd_task_impl(), + get_embedding_bwd_signature()}; case task_id_t::GATHER_INIT_TASK_ID: return TaskSignatureAndImpl{get_gather_init_task_impl(), get_gather_init_signature()}; @@ -169,9 +166,6 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::REDUCE_BWD_TASK_ID: return 
TaskSignatureAndImpl{get_reduce_bwd_task_impl(), get_reduce_bwd_signature()}; - case task_id_t::RESHAPE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_reshape_init_task_impl(), - get_reshape_init_signature()}; case task_id_t::RESHAPE_FWD_TASK_ID: return TaskSignatureAndImpl{get_reshape_fwd_task_impl(), get_reshape_fwd_signature()}; @@ -184,9 +178,6 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::REVERSE_BWD_TASK_ID: return TaskSignatureAndImpl{get_reverse_bwd_task_impl(), get_reverse_bwd_signature()}; - case task_id_t::TOPK_INIT_TASK_ID: - return TaskSignatureAndImpl{get_topk_init_task_impl(), - get_topk_init_signature()}; case task_id_t::TOPK_FWD_TASK_ID: return TaskSignatureAndImpl{get_topk_fwd_task_impl(), get_topk_fwd_signature()}; @@ -208,37 +199,8 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::ATTENTION_BWD_TASK_ID: return TaskSignatureAndImpl{get_attention_bwd_task_impl(), get_attention_bwd_signature()}; - case task_id_t::COMBINE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_fwd_task_impl(), - get_combine_fwd_signature()}; - case task_id_t::COMBINE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_bwd_task_impl(), - get_combine_bwd_signature()}; - case task_id_t::REDUCTION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_fwd_task_impl(), - get_reduction_fwd_signature()}; - case task_id_t::REDUCTION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_bwd_task_impl(), - get_reduction_bwd_signature()}; - case task_id_t::REPARTITION_INIT_TASK_ID: - return TaskSignatureAndImpl{get_repartition_init_task_impl(), - get_repartition_init_signature()}; - case task_id_t::REPARTITION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_fwd_task_impl(), - get_repartition_fwd_signature()}; - case task_id_t::REPARTITION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_bwd_task_impl(), - get_repartition_bwd_signature()}; - case task_id_t::REPLICATE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_fwd_task_impl(), - get_replicate_fwd_signature()}; - case task_id_t::REPLICATE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_bwd_task_impl(), - get_replicate_bwd_signature()}; default: - throw mk_runtime_error( - fmt::format("Invalid task ID")); // inserting task_id yields - // "type_is_unformattable" error + PANIC("Unhandled task ID", task_id); } } @@ -252,9 +214,7 @@ std::vector get_task_ids(ComputationGraphOpAttrs const &op) { [](DropoutAttrs const &attrs) { return get_task_ids(attrs); }, [](ElementBinaryAttrs const &attrs) { return get_task_ids(attrs); }, [](ElementUnaryAttrs const &attrs) { return get_task_ids(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return get_task_ids(attrs); - // }, + [](EmbeddingAttrs const &attrs) { return get_task_ids(attrs); }, [](FlatAttrs const &attrs) { return get_task_ids(attrs); }, [](GatherAttrs const &attrs) { return get_task_ids(attrs); }, [](InputAttrs const &attrs) { return get_task_ids(attrs); }, @@ -277,7 +237,8 @@ std::vector get_task_ids(ComputationGraphOpAttrs const &op) { }); } -OpTaskInvocation init(ComputationGraphOpAttrs const &op) { +OpTaskInvocation + get_init_op_task_invocation(ComputationGraphOpAttrs const &op) { return op.visit(overload{ [](BatchNormAttrs const &attrs) { return init(attrs); }, [](Conv2DAttrs const &attrs) { return init(attrs); }, @@ -290,16 +251,15 @@ OpTaskInvocation init(ComputationGraphOpAttrs const &op) { [](MultiHeadAttentionAttrs const &attrs) { return init(attrs); }, 
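      // (Only ops that still carry per-device state appear in this init
      // dispatch; Reshape and TopK drop out now that their INIT task IDs
      // are removed above, and any other attr type falls through to the
      // PANIC overload at the end of the visit.)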
[](Pool2DAttrs const &attrs) { return init(attrs); }, [](ReduceAttrs const &attrs) { return init(attrs); }, - [](ReshapeAttrs const &attrs) { return init(attrs); }, [](SoftmaxAttrs const &attrs) { return init(attrs); }, - [](TopKAttrs const &attrs) { return init(attrs); }, [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); + PANIC("Unhandled attr type", attrs); }, }); } -OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { +OpTaskInvocation + get_forward_op_task_invocation(ComputationGraphOpAttrs const &op) { return op.visit(overload{ [](BatchMatmulAttrs const &attrs) { return forward(attrs); }, [](BatchNormAttrs const &attrs) { return forward(attrs); }, @@ -309,9 +269,7 @@ OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { [](DropoutAttrs const &attrs) { return forward(attrs); }, [](ElementBinaryAttrs const &attrs) { return forward(attrs); }, [](ElementUnaryAttrs const &attrs) { return forward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return forward(attrs); - // }, + [](EmbeddingAttrs const &attrs) { return forward(attrs); }, [](FlatAttrs const &attrs) { return forward(attrs); }, [](GatherAttrs const &attrs) { return forward(attrs); }, [](LayerNormAttrs const &attrs) { return forward(attrs); }, @@ -331,7 +289,8 @@ OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { }); } -OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { +OpTaskInvocation + get_backward_op_task_invocation(ComputationGraphOpAttrs const &op) { return op.visit(overload{ [](BatchMatmulAttrs const &attrs) { return backward(attrs); }, [](BatchNormAttrs const &attrs) { return backward(attrs); }, @@ -341,9 +300,7 @@ OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { [](DropoutAttrs const &attrs) { return backward(attrs); }, [](ElementBinaryAttrs const &attrs) { return backward(attrs); }, [](ElementUnaryAttrs const &attrs) { return backward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return backward(attrs); - // }, + [](EmbeddingAttrs const &attrs) { return backward(attrs); }, [](FlatAttrs const &attrs) { return backward(attrs); }, [](GatherAttrs const &attrs) { return backward(attrs); }, [](LayerNormAttrs const &attrs) { return backward(attrs); }, @@ -358,7 +315,7 @@ OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { [](TopKAttrs const &attrs) { return backward(attrs); }, [](TransposeAttrs const &attrs) { return backward(attrs); }, [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); + PANIC("Unhandled attr type", attrs); }, }); } diff --git a/lib/task-spec/src/task-spec/training_computation_graph.cc b/lib/task-spec/src/task-spec/training_computation_graph.cc new file mode 100644 index 0000000000..f50930d684 --- /dev/null +++ b/lib/task-spec/src/task-spec/training_computation_graph.cc @@ -0,0 +1,183 @@ +#include "task-spec/training_computation_graph.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/training_tensor_group.h" +#include "task-spec/training_tensor_group_with_attrs.h" +#include "utils/containers/contains.h" +#include "utils/containers/filter_values.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/get_only.h" +#include "utils/containers/keys.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/overload.h" + +namespace FlexFlow { + +TrainingComputationGraph 
generate_training_computation_graph( + ComputationGraph const &computation_graph, + OptimizerAttrs const &optimizer_attrs, + tensor_guid_t const &logit_tensor, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, + LossTensorSource &loss_tensor_source) { + + loss_tensor_guid_t label_tensor = loss_tensor_source.new_loss_tensor(); + + return TrainingComputationGraph{ + /*computation_graph=*/computation_graph, + /*training_tensor_group_for_tensor=*/ + transform( + get_all_tensor_attrs(computation_graph), + [&](tensor_guid_t tensor_guid, TensorAttrs const &tensor_attrs) { + return std::pair{ + tensor_guid, + make_training_tensor_group_for_tensor_guid_t( + /*tensor_guid=*/tensor_guid, + /*tensor_attrs=*/tensor_attrs, + /*optimizer_attrs=*/optimizer_attrs, + /*forward_tensor_source=*/forward_tensor_source, + /*gradient_tensor_source=*/gradient_tensor_source, + /*optimizer_tensor_source=*/optimizer_tensor_source), + }; + }), + /*logit_tensor=*/logit_tensor, + /*label_tensor=*/label_tensor, + }; +} + +TrainingTensorGroup get_training_tensor_group_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t tensor_guid) { + + return training_cg.training_tensor_group_for_tensor.at(tensor_guid); +} + +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_tensor_guid( + TrainingComputationGraph const &training_cg, + tensor_guid_t tensor_guid) { + return make_training_tensor_group_with_attrs_from_group_and_attrs( + /*group=*/get_training_tensor_group_for_tensor_guid(training_cg, + tensor_guid), + /*attrs=*/get_tensor_attrs(training_cg.computation_graph, tensor_guid)); +} + +forward_tensor_guid_t get_forward_tensor_guid_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t t) { + return training_cg.training_tensor_group_for_tensor.at(t).forward_tensor; +} + +gradient_tensor_guid_t get_gradient_tensor_guid_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t t) { + return training_cg.training_tensor_group_for_tensor.at(t).gradient_tensor; +} + +std::vector get_optimizer_tensor_guids_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t t) { + return training_cg.training_tensor_group_for_tensor.at(t).optimizer_tensors; +} + +tensor_guid_t get_tensor_guid_for_forward_tensor_guid( + TrainingComputationGraph const &training_cg, forward_tensor_guid_t t) { + return get_only(keys(filter_values( + training_cg.training_tensor_group_for_tensor, + [&](TrainingTensorGroup const &g) { return g.forward_tensor == t; }))); +} + +tensor_guid_t get_tensor_guid_for_gradient_tensor_guid( + TrainingComputationGraph const &training_cg, gradient_tensor_guid_t t) { + return get_only(keys(filter_values( + training_cg.training_tensor_group_for_tensor, + [&](TrainingTensorGroup const &g) { return g.gradient_tensor == t; }))); +} + +tensor_guid_t get_tensor_guid_for_optimizer_tensor_guid( + TrainingComputationGraph const &training_cg, optimizer_tensor_guid_t t) { + return get_only( + keys(filter_values(training_cg.training_tensor_group_for_tensor, + [&](TrainingTensorGroup const &g) { + return contains(g.optimizer_tensors, t); + }))); +} + +tensor_guid_t get_tensor_guid_for_training_tensor_guid( + TrainingComputationGraph const &training_cg, training_tensor_guid_t t) { + return t.visit(overload{ + [&](forward_tensor_guid_t forward_tensor) { + return get_tensor_guid_for_forward_tensor_guid(training_cg, + forward_tensor); + }, + 
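      // (Each case in this visit performs the same reverse lookup as the
      // helpers above: filter training_tensor_group_for_tensor down to the
      // unique group owning the given training tensor, with get_only
      // asserting that exactly one group matches.)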
[&](gradient_tensor_guid_t gradient_tensor) { + return get_tensor_guid_for_gradient_tensor_guid(training_cg, + gradient_tensor); + }, + [&](optimizer_tensor_guid_t optimizer_tensor) { + return get_tensor_guid_for_optimizer_tensor_guid(training_cg, + optimizer_tensor); + }, + [&](loss_tensor_guid_t loss_tensor) -> tensor_guid_t { + PANIC("no tensor_guid_t can exist for a loss_tensor_guid_t"); + }, + }); +} + +std::unordered_set + get_all_training_tensors_in_training_computation_graph( + TrainingComputationGraph const &training_cg) { + std::unordered_set result = flatmap( + unordered_set_of(keys(training_cg.training_tensor_group_for_tensor)), + [&](tensor_guid_t t) { + return get_all_training_tensors_in_tensor_group( + training_cg.training_tensor_group_for_tensor.at(t)); + }); + + result.insert(training_tensor_guid_t{training_cg.label_tensor}); + return result; +} + +TrainingLayerPlusContext + get_training_layer_plus_context(TrainingComputationGraph const &training_cg, + layer_guid_t layer_guid) { + auto get_tensor_group_with_attrs = + [&](tensor_guid_t t) -> TrainingTensorGroupWithAttrs { + return get_training_tensor_group_with_attrs_for_tensor_guid(training_cg, t); + }; + + return TrainingLayerPlusContext{ + /*layer_guid=*/layer_guid, + /*layer_attrs=*/ + get_layer_attrs(training_cg.computation_graph, layer_guid), + /*input_tensor_groups=*/ + transform(get_incoming_inputs(training_cg.computation_graph, layer_guid), + get_tensor_group_with_attrs), + /*weight_tensor_groups=*/ + transform(get_incoming_weights(training_cg.computation_graph, layer_guid), + get_tensor_group_with_attrs), + /*output_tensor_groups=*/ + transform(get_outgoing_tensors(training_cg.computation_graph, layer_guid), + get_tensor_group_with_attrs), + }; +} + +std::unordered_map + get_all_training_tensor_shapes( + TrainingComputationGraph const &training_cg) { + return generate_map( + get_all_training_tensors_in_training_computation_graph(training_cg), + [&](training_tensor_guid_t t) { + if (t.is_loss_tensor()) { + ASSERT(t == training_tensor_guid_t{training_cg.label_tensor}); + return get_tensor_attrs(training_cg.computation_graph, + training_cg.logit_tensor) + .shape; + } + + return get_tensor_attrs( + training_cg.computation_graph, + get_tensor_guid_for_training_tensor_guid(training_cg, t)) + .shape; + }); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_layer_plus_context.cc b/lib/task-spec/src/task-spec/training_layer_plus_context.cc new file mode 100644 index 0000000000..9adbc6b2a1 --- /dev/null +++ b/lib/task-spec/src/task-spec/training_layer_plus_context.cc @@ -0,0 +1,122 @@ +#include "task-spec/training_layer_plus_context.h" +#include "task-spec/training_tensor_group_with_attrs.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +std::vector + get_training_tensor_groups_with_attrs_for_role( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role) { + + switch (tensor_role) { + case TensorRole::INPUT: + return training_layer_plus_context.input_tensor_groups; + case TensorRole::WEIGHT: + return training_layer_plus_context.weight_tensor_groups; + case TensorRole::OUTPUT: + return training_layer_plus_context.output_tensor_groups; + default: + PANIC("Unhandled TensorRole {}", tensor_role); + } +} + +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_role_and_index( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role, + nonnegative_int index) { + + return 
get_training_tensor_groups_with_attrs_for_role( + training_layer_plus_context, tensor_role) + .at(index.unwrap_nonnegative()); +} + +std::vector + get_input_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.input_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.forward_tensor; }); +} + +std::vector + get_input_grad_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.input_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.gradient_tensor; }); +} + +std::vector + get_input_tensor_shapes(TrainingLayerPlusContext const &l) { + return transform(l.input_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { + return g.tensor_attrs.shape; + }); +} + +std::vector + get_weight_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.weight_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.forward_tensor; }); +} + +std::vector + get_weight_grad_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.weight_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.gradient_tensor; }); +} + +std::vector + get_weight_tensor_shapes(TrainingLayerPlusContext const &l) { + return transform(l.weight_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { + return g.tensor_attrs.shape; + }); +} + +std::vector + get_output_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.output_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.forward_tensor; }); +} + +std::vector + get_output_grad_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.output_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.gradient_tensor; }); +} + +std::vector + get_output_tensor_shapes(TrainingLayerPlusContext const &l) { + return transform(l.output_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { + return g.tensor_attrs.shape; + }); +} + +TrainingLayerTensorGroupSignature + get_tensor_group_signature(TrainingLayerPlusContext const &l) { + return TrainingLayerTensorGroupSignature{ + /*input_tensor_groups=*/transform(l.input_tensor_groups, + tensor_group_without_attrs), + /*weight_tensor_groups=*/ + transform(l.weight_tensor_groups, tensor_group_without_attrs), + /*output_tensor_groups=*/ + transform(l.output_tensor_groups, tensor_group_without_attrs), + }; +} + +CGOperatorTensorShapeSignature + get_cg_op_shape_signature(TrainingLayerPlusContext const &l) { + return CGOperatorTensorShapeSignature{ + /*input_shapes=*/get_input_tensor_shapes(l), + /*weight_shapes=*/get_weight_tensor_shapes(l), + /*output_shapes=*/get_output_tensor_shapes(l), + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc b/lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc new file mode 100644 index 0000000000..db8b8015ec --- /dev/null +++ b/lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc @@ -0,0 +1,31 @@ +#include "task-spec/training_layer_tensor_group_signature.h" +#include + +namespace FlexFlow { + +std::vector get_training_tensor_groups_for_role( + TrainingLayerTensorGroupSignature const &signature, + TensorRole tensor_role) { + + switch (tensor_role) { + case TensorRole::INPUT: + return signature.input_tensor_groups; + case TensorRole::WEIGHT: + return signature.weight_tensor_groups; + case TensorRole::OUTPUT: + return signature.output_tensor_groups; + default: + PANIC("Unhandled TensorRole {}", tensor_role); + } +} + 
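// Usage sketch (illustrative only; `sig` and `first_weight` are
// hypothetical names):
//
//   std::vector<TrainingTensorGroup> weights =
//       get_training_tensor_groups_for_role(sig, TensorRole::WEIGHT);
//   TrainingTensorGroup first_weight =
//       get_training_tensor_group_for_role_and_index(
//           sig, TensorRole::WEIGHT, 0_n);
//
// get_training_tensor_group_for_role_and_index (defined next) simply
// indexes into the same per-role vector.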
+TrainingTensorGroup get_training_tensor_group_for_role_and_index( + TrainingLayerTensorGroupSignature const &signature, + TensorRole tensor_role, + nonnegative_int index) { + + return get_training_tensor_groups_for_role(signature, tensor_role) + .at(index.unwrap_nonnegative()); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_tensor_group.cc b/lib/task-spec/src/task-spec/training_tensor_group.cc new file mode 100644 index 0000000000..0f6710b80f --- /dev/null +++ b/lib/task-spec/src/task-spec/training_tensor_group.cc @@ -0,0 +1,48 @@ +#include "task-spec/training_tensor_group.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/repeat.h" +#include "utils/containers/set_union.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" + +namespace FlexFlow { + +TrainingTensorGroup make_training_tensor_group_for_tensor_guid_t( + tensor_guid_t tensor_guid, + TensorAttrs const &tensor_attrs, + OptimizerAttrs const &optimizer_attrs, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source) { + + nonnegative_int num_optimizer_tensors = [&]() { + if (tensor_attrs.create_grad == CreateGrad::YES) { + return get_num_optimizer_tensors(optimizer_attrs); + } else { + return 0_n; + } + }(); + + return TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor_source.new_forward_tensor(), + /*gradient_tensor=*/gradient_tensor_source.new_gradient_tensor(), + /*optimizer_tensors=*/ + repeat(num_optimizer_tensors, + [&]() { return optimizer_tensor_source.new_optimizer_tensor(); }), + }; +} + +std::unordered_set + get_all_training_tensors_in_tensor_group(TrainingTensorGroup const &group) { + return set_union( + std::unordered_set{ + training_tensor_guid_t{group.forward_tensor}, + training_tensor_guid_t{group.gradient_tensor}, + }, + transform(unordered_set_of(group.optimizer_tensors), + [](optimizer_tensor_guid_t optimizer_tensor) { + return training_tensor_guid_t{optimizer_tensor}; + })); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc b/lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc new file mode 100644 index 0000000000..6014b46446 --- /dev/null +++ b/lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc @@ -0,0 +1,26 @@ +#include "task-spec/training_tensor_group_with_attrs.h" + +namespace FlexFlow { + +TrainingTensorGroupWithAttrs + make_training_tensor_group_with_attrs_from_group_and_attrs( + TrainingTensorGroup const &group, TensorAttrs const &attrs) { + + return TrainingTensorGroupWithAttrs{ + /*tensor_attrs=*/attrs, + /*forward_tensor=*/group.forward_tensor, + /*gradient_tensor=*/group.gradient_tensor, + /*optimizer_tensors=*/group.optimizer_tensors, + }; +} + +TrainingTensorGroup + tensor_group_without_attrs(TrainingTensorGroupWithAttrs const &with_attrs) { + return TrainingTensorGroup{ + /*forward_tensor=*/with_attrs.forward_tensor, + /*gradient_tensor=*/with_attrs.gradient_tensor, + /*optimizer_tensors=*/with_attrs.optimizer_tensors, + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/test/src/task-spec/training_tensor_group.cc b/lib/task-spec/test/src/task-spec/training_tensor_group.cc new file mode 100644 index 0000000000..b40c38ce69 --- /dev/null +++ b/lib/task-spec/test/src/task-spec/training_tensor_group.cc @@ -0,0 +1,36 @@ +#include "task-spec/training_tensor_group.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using 
namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_all_training_tensors_in_tensor_group") { + forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3}; + gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5}; + optimizer_tensor_guid_t optimizer_tensor1 = optimizer_tensor_guid_t{8}; + optimizer_tensor_guid_t optimizer_tensor2 = optimizer_tensor_guid_t{3}; + + std::vector optimizer_tensors = { + optimizer_tensor1, + optimizer_tensor2, + }; + + TrainingTensorGroup training_tensor_group = TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + std::unordered_set result = + get_all_training_tensors_in_tensor_group(training_tensor_group); + std::unordered_set correct = { + training_tensor_guid_t{forward_tensor}, + training_tensor_guid_t{gradient_tensor}, + training_tensor_guid_t{optimizer_tensor1}, + training_tensor_guid_t{optimizer_tensor2}, + }; + + CHECK(result == correct); + } +} diff --git a/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc b/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc new file mode 100644 index 0000000000..f769a877ad --- /dev/null +++ b/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc @@ -0,0 +1,84 @@ +#include "task-spec/training_tensor_group_with_attrs.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("make_training_tensor_group_with_attrs_from_group_and_attrs") { + TensorAttrs tensor_attrs = TensorAttrs{ + /*shape=*/TensorShape{ + /*dims=*/TensorDims{FFOrdered{ + 8_p, + 2_p, + 3_p, + }}, + /*data_type=*/DataType::FLOAT, + }, + /*create_grad=*/CreateGrad::YES, + }; + + forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3}; + gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5}; + std::vector optimizer_tensors = { + optimizer_tensor_guid_t{8}, + optimizer_tensor_guid_t{3}, + }; + + TrainingTensorGroup training_tensor_group = TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + TrainingTensorGroupWithAttrs result = + make_training_tensor_group_with_attrs_from_group_and_attrs( + training_tensor_group, tensor_attrs); + TrainingTensorGroupWithAttrs correct = TrainingTensorGroupWithAttrs{ + /*tensor_attrs=*/tensor_attrs, + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + CHECK(result == correct); + } + + TEST_CASE("tensor_group_without_attrs") { + TensorAttrs tensor_attrs = TensorAttrs{ + /*shape=*/TensorShape{ + /*dims=*/TensorDims{FFOrdered{ + 8_p, + 2_p, + 3_p, + }}, + /*data_type=*/DataType::FLOAT, + }, + /*create_grad=*/CreateGrad::YES, + }; + + forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3}; + gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5}; + std::vector optimizer_tensors = { + optimizer_tensor_guid_t{8}, + optimizer_tensor_guid_t{3}, + }; + + TrainingTensorGroupWithAttrs tensor_group_with_attrs = + TrainingTensorGroupWithAttrs{ + /*tensor_attrs=*/tensor_attrs, + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + TrainingTensorGroup result = + tensor_group_without_attrs(tensor_group_with_attrs); + TrainingTensorGroup correct = TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + 
/*optimizer_tensors=*/optimizer_tensors, + }; + + CHECK(result == correct); + } +} diff --git a/lib/utils/include/utils/archetypes/ordered_value_type.h b/lib/utils/include/utils/archetypes/ordered_value_type.h index 5218794fd1..b14f378667 100644 --- a/lib/utils/include/utils/archetypes/ordered_value_type.h +++ b/lib/utils/include/utils/archetypes/ordered_value_type.h @@ -39,6 +39,16 @@ struct ordered_value_type { } }; +template +std::string format_as(ordered_value_type const &) { + PANIC(); +} + +template +std::ostream &operator<<(std::ostream &s, ordered_value_type const &x) { + PANIC(); +} + } // namespace FlexFlow namespace std { diff --git a/lib/utils/include/utils/containers/all_are_true.h b/lib/utils/include/utils/containers/all_are_true.h new file mode 100644 index 0000000000..00a4d6016a --- /dev/null +++ b/lib/utils/include/utils/containers/all_are_true.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ALL_ARE_TRUE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ALL_ARE_TRUE_H + +namespace FlexFlow { + +template +bool all_are_true(Container const &c) { + bool result = true; + for (bool b : c) { + result &= b; + } + return result; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/collapse_optionals.h b/lib/utils/include/utils/containers/collapse_optionals.h new file mode 100644 index 0000000000..9e39e25a57 --- /dev/null +++ b/lib/utils/include/utils/containers/collapse_optionals.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_COLLAPSE_OPTIONALS_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_COLLAPSE_OPTIONALS_H + +#include + +namespace FlexFlow { + +template +std::optional collapse_optionals(std::optional> const &o) { + if (!o.has_value()) { + return std::nullopt; + } + + return o.value(); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/contains_value.h b/lib/utils/include/utils/containers/contains_value.h new file mode 100644 index 0000000000..63d21a054a --- /dev/null +++ b/lib/utils/include/utils/containers/contains_value.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_CONTAINS_VALUE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_CONTAINS_VALUE_H + +#include +#include + +namespace FlexFlow { + +template +bool contains_value(std::unordered_map const &m, V const &v) { + for (auto const &[kk, vv] : m) { + if (vv == v) { + return true; + } + } + + return false; +} + +template +bool contains_value(std::map const &m, V const &v) { + for (auto const &[kk, vv] : m) { + if (vv == v) { + return true; + } + } + + return false; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/filter_keys.h b/lib/utils/include/utils/containers/filter_keys.h index f240fd2526..0758c48d49 100644 --- a/lib/utils/include/utils/containers/filter_keys.h +++ b/lib/utils/include/utils/containers/filter_keys.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_FILTER_KEYS_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_FILTER_KEYS_H +#include #include namespace FlexFlow { @@ -17,6 +18,17 @@ std::unordered_map filter_keys(std::unordered_map const &m, return result; } +template +std::map filter_keys(std::map const &m, F const &f) { + std::map result; + for (std::pair const &kv : m) { + if (f(kv.first)) { + result.insert(kv); + } + } + return result; +} + } // namespace FlexFlow #endif diff --git a/lib/utils/include/utils/containers/filtrans.h 
b/lib/utils/include/utils/containers/filtrans.h index be1b5093c9..9ee65dee74 100644 --- a/lib/utils/include/utils/containers/filtrans.h +++ b/lib/utils/include/utils/containers/filtrans.h @@ -23,7 +23,7 @@ using unwrap_optional_t = typename unwrap_optional::type; template >> -std::vector filtrans(std::vector const &v, F f) { +std::vector filtrans(std::vector const &v, F &&f) { std::vector result; for (In const &i : v) { @@ -39,7 +39,7 @@ std::vector filtrans(std::vector const &v, F f) { template >> -std::unordered_set filtrans(std::unordered_set const &s, F f) { +std::unordered_set filtrans(std::unordered_set const &s, F &&f) { std::unordered_set result; for (In const &i : s) { @@ -55,7 +55,7 @@ std::unordered_set filtrans(std::unordered_set const &s, F f) { template >> -std::set filtrans(std::set const &s, F f) { +std::set filtrans(std::set const &s, F &&f) { std::set result; for (In const &i : s) { diff --git a/lib/utils/include/utils/containers/flatmap.h b/lib/utils/include/utils/containers/flatmap.h index a7848b88aa..eaa8d1dbef 100644 --- a/lib/utils/include/utils/containers/flatmap.h +++ b/lib/utils/include/utils/containers/flatmap.h @@ -42,6 +42,17 @@ std::unordered_set flatmap_v2(std::unordered_set const &v, return result; } +template >> +std::set flatmap(std::set const &v, F const &f) { + std::set result; + for (auto const &elem : v) { + extend(result, f(elem)); + } + return result; +} + template < typename InK, typename InV, diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index f95eb8a38d..959edcff8a 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -31,7 +31,8 @@ T throw_if_unexpected(tl::expected const &r) { if (r.has_value()) { return r.value(); } else { - throw std::runtime_error(fmt::to_string(r.error())); + PANIC(fmt::to_string(r.error())); + ; } } diff --git a/lib/utils/include/utils/fmt/half.h b/lib/utils/include/utils/fmt/half.h new file mode 100644 index 0000000000..9cc1b5c1e7 --- /dev/null +++ b/lib/utils/include/utils/fmt/half.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_HALF_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_HALF_H + +#include "utils/half.h" +#include + +namespace fmt { + +template +struct formatter<::half, Char> : formatter { + template + auto format(::half const &h, FormatContext &ctx) -> decltype(ctx.out()) { + + return formatter::format(h, ctx); + } +}; + +} // namespace fmt + +namespace FlexFlow { + +std::ostream &operator<<(std::ostream &, ::half); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/fmt/set.h b/lib/utils/include/utils/fmt/set.h index c46984cc5a..d619d91500 100644 --- a/lib/utils/include/utils/fmt/set.h +++ b/lib/utils/include/utils/fmt/set.h @@ -4,6 +4,7 @@ #include "utils/check_fmtable.h" #include "utils/containers/sorted.h" #include "utils/join_strings.h" +#include "utils/type_traits_core.h" #include #include #include @@ -13,7 +14,7 @@ namespace fmt { template struct formatter<::std::set, Char, - std::enable_if_t>::value>> + std::enable_if_t>::value>> : formatter<::std::string> { template auto format(::std::set const &m, FormatContext &ctx) const diff --git a/lib/utils/include/utils/fp16.h b/lib/utils/include/utils/half.h similarity index 100% rename from lib/utils/include/utils/fp16.h rename to lib/utils/include/utils/half.h diff --git a/lib/utils/include/utils/json/half.h b/lib/utils/include/utils/json/half.h new file mode 100644 index 0000000000..a16d03a3e2 --- /dev/null +++ 
b/lib/utils/include/utils/json/half.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_JSON_HALF_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_JSON_HALF_H + +#include "utils/half.h" +#include + +namespace nlohmann { + +template <> +struct adl_serializer { + static void to_json(json &j, half x); + static void from_json(json const &j, half &t); +}; + +} // namespace nlohmann + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h index a266ddea77..c775cfc9ed 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -14,6 +14,7 @@ class nonnegative_int { nonnegative_int() = delete; explicit nonnegative_int(int value); explicit nonnegative_int(size_t value); + explicit nonnegative_int(unsigned long long int value); explicit operator int() const noexcept; diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_range.h b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h index af323aef42..149671e243 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_range.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h @@ -2,10 +2,12 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H #include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" namespace FlexFlow { std::vector nonnegative_range(nonnegative_int end); +std::vector nonnegative_range(positive_int end); std::vector nonnegative_range(nonnegative_int start, nonnegative_int end, int step = 1); diff --git a/lib/utils/include/utils/rapidcheck/half.h b/lib/utils/include/utils/rapidcheck/half.h new file mode 100644 index 0000000000..ffa85ed41f --- /dev/null +++ b/lib/utils/include/utils/rapidcheck/half.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_HALF_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_HALF_H + +#include "utils/half.h" +#include + +namespace rc { + +template <> +struct Arbitrary<::half> { + static Gen<::half> arbitrary(); +}; + +} // namespace rc + +#endif diff --git a/lib/utils/include/utils/rapidcheck/monostate.h b/lib/utils/include/utils/rapidcheck/monostate.h new file mode 100644 index 0000000000..b34c069574 --- /dev/null +++ b/lib/utils/include/utils/rapidcheck/monostate.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_MONOSTATE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_MONOSTATE_H + +#include +#include + +namespace rc { + +template <> +struct Arbitrary { + static Gen arbitrary(); +}; + +} // namespace rc + +#endif diff --git a/lib/utils/include/utils/units/milliseconds_t.h b/lib/utils/include/utils/units/milliseconds_t.h new file mode 100644 index 0000000000..ed3d5776a3 --- /dev/null +++ b/lib/utils/include/utils/units/milliseconds_t.h @@ -0,0 +1,67 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_MILLISECONDS_T_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_MILLISECONDS_T_H + +#include +#include +#include +#include + +namespace FlexFlow { + +struct milliseconds_t { +public: + milliseconds_t() = delete; + explicit milliseconds_t(float); + + bool operator<(milliseconds_t const &other) const; + bool operator==(milliseconds_t const &other) const; + bool operator>(milliseconds_t const &other) const; + bool operator<=(milliseconds_t const &other) const; + bool operator!=(milliseconds_t const &other) const; + bool operator>=(milliseconds_t const &other) const; + + milliseconds_t 
operator+(milliseconds_t const &other) const; + + float unwrap_milliseconds() const; + +private: + float value; +}; + +milliseconds_t operator""_ms(long double); +milliseconds_t operator""_ms(unsigned long long int); + +std::ostream &operator<<(std::ostream &, milliseconds_t const &); +std::string format_as(milliseconds_t const &); + +} // namespace FlexFlow + +namespace nlohmann { + +template <> +struct adl_serializer<::FlexFlow::milliseconds_t> { + static ::FlexFlow::milliseconds_t from_json(json const &j); + static void to_json(json &j, ::FlexFlow::milliseconds_t t); +}; + +} // namespace nlohmann + +namespace rc { + +template <> +struct Arbitrary<::FlexFlow::milliseconds_t> { + static Gen<::FlexFlow::milliseconds_t> arbitrary(); +}; + +} // namespace rc + +namespace std { + +template <> +struct hash<::FlexFlow::milliseconds_t> { + size_t operator()(::FlexFlow::milliseconds_t const &) const noexcept; +}; + +} // namespace std + +#endif diff --git a/lib/utils/include/utils/units/num_bytes_t.h b/lib/utils/include/utils/units/num_bytes_t.h new file mode 100644 index 0000000000..453cf4c84f --- /dev/null +++ b/lib/utils/include/utils/units/num_bytes_t.h @@ -0,0 +1,62 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_NUM_BYTES_T_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_NUM_BYTES_T_H + +#include "utils/nonnegative_int/nonnegative_int.h" +namespace FlexFlow { + +struct num_bytes_t { +public: + num_bytes_t() = delete; + explicit num_bytes_t(nonnegative_int); + + bool operator<(num_bytes_t const &other) const; + bool operator==(num_bytes_t const &other) const; + bool operator>(num_bytes_t const &other) const; + bool operator<=(num_bytes_t const &other) const; + bool operator!=(num_bytes_t const &other) const; + bool operator>=(num_bytes_t const &other) const; + + num_bytes_t operator+(num_bytes_t const &other) const; + + nonnegative_int unwrap_num_bytes() const; + +private: + nonnegative_int value; +}; + +num_bytes_t operator""_bytes(unsigned long long int); + +std::ostream &operator<<(std::ostream &, num_bytes_t const &); +std::string format_as(num_bytes_t const &); + +} // namespace FlexFlow + +namespace nlohmann { + +template <> +struct adl_serializer<::FlexFlow::num_bytes_t> { + static ::FlexFlow::num_bytes_t from_json(json const &j); + static void to_json(json &j, ::FlexFlow::num_bytes_t t); +}; + +} // namespace nlohmann + +namespace rc { + +template <> +struct Arbitrary<::FlexFlow::num_bytes_t> { + static Gen<::FlexFlow::num_bytes_t> arbitrary(); +}; + +} // namespace rc + +namespace std { + +template <> +struct hash<::FlexFlow::num_bytes_t> { + size_t operator()(::FlexFlow::num_bytes_t const &) const noexcept; +}; + +} // namespace std + +#endif diff --git a/lib/utils/src/fp16.cc b/lib/utils/src/half.cc similarity index 87% rename from lib/utils/src/fp16.cc rename to lib/utils/src/half.cc index f9dbf486ab..3dbea5c4dc 100644 --- a/lib/utils/src/fp16.cc +++ b/lib/utils/src/half.cc @@ -1,4 +1,4 @@ -#include "utils/fp16.h" +#include "utils/half.h" #include "utils/hash-utils.h" namespace std { diff --git a/lib/utils/src/utils/containers/all_are_true.cc b/lib/utils/src/utils/containers/all_are_true.cc new file mode 100644 index 0000000000..5647069f0e --- /dev/null +++ b/lib/utils/src/utils/containers/all_are_true.cc @@ -0,0 +1,10 @@ +#include "utils/containers/all_are_true.h" +#include + +namespace FlexFlow { + +using Container = std::vector; + +template bool all_are_true(Container const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/collapse_optionals.cc 
b/lib/utils/src/utils/containers/collapse_optionals.cc
new file mode 100644
index 0000000000..b55b16a908
--- /dev/null
+++ b/lib/utils/src/utils/containers/collapse_optionals.cc
@@ -0,0 +1,11 @@
+#include "utils/containers/collapse_optionals.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using T = value_type<0>;
+
+template std::optional<T>
+    collapse_optionals(std::optional<std::optional<T>> const &);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/containers/contains_value.cc b/lib/utils/src/utils/containers/contains_value.cc
new file mode 100644
index 0000000000..d9d2118658
--- /dev/null
+++ b/lib/utils/src/utils/containers/contains_value.cc
@@ -0,0 +1,13 @@
+#include "utils/containers/contains_value.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using K = value_type<0>;
+using V = value_type<1>;
+
+template bool contains_value(std::unordered_map<K, V> const &, V const &);
+
+template bool contains_value(std::map<K, V> const &, V const &);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/containers/filtrans.cc b/lib/utils/src/utils/containers/filtrans.cc
index a57a743ef0..c65a22a669 100644
--- a/lib/utils/src/utils/containers/filtrans.cc
+++ b/lib/utils/src/utils/containers/filtrans.cc
@@ -1 +1,12 @@
 #include "utils/containers/filtrans.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using In = value_type<0>;
+using Out = value_type<1>;
+using F = std::function<std::optional<Out>(In const &)>;
+
+template std::vector<Out> filtrans(std::vector<In> const &, F &&);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/fmt/half.cc b/lib/utils/src/utils/fmt/half.cc
new file mode 100644
index 0000000000..0075e6e7a7
--- /dev/null
+++ b/lib/utils/src/utils/fmt/half.cc
@@ -0,0 +1,9 @@
+#include "utils/fmt/half.h"
+
+namespace FlexFlow {
+
+std::ostream &operator<<(std::ostream &s, ::half h) {
+  return (s << static_cast<float>(h));
+}
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/fmt/set.cc b/lib/utils/src/utils/fmt/set.cc
index 857367af48..db439414c9 100644
--- a/lib/utils/src/utils/fmt/set.cc
+++ b/lib/utils/src/utils/fmt/set.cc
@@ -1 +1,16 @@
 #include "utils/fmt/set.h"
+#include "utils/archetypes/ordered_value_type.h"
+
+using T = ::FlexFlow::ordered_value_type<0>;
+
+namespace fmt {
+
+template struct formatter<::std::set<T>, char>;
+
+}
+
+namespace FlexFlow {
+
+template std::ostream &operator<<(std::ostream &, std::set<T> const &);
+
+}
diff --git a/lib/utils/src/utils/json/half.cc b/lib/utils/src/utils/json/half.cc
new file mode 100644
index 0000000000..6555de13c5
--- /dev/null
+++ b/lib/utils/src/utils/json/half.cc
@@ -0,0 +1,13 @@
+#include "utils/json/half.h"
+
+namespace nlohmann {
+
+void adl_serializer<half>::to_json(json &j, half x) {
+  j = static_cast<float>(x);
+}
+
+void adl_serializer<half>::from_json(json const &j, half &x) {
+  x = j.get<float>();
+}
+
+} // namespace nlohmann
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
index 3472a7eee2..7593a8e9ec 100644
--- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
+++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
@@ -4,20 +4,20 @@ namespace FlexFlow {
 
 nonnegative_int::nonnegative_int(int value) {
-  if (value < 0) {
-    throw std::invalid_argument(
-        "Value of nonnegative_int type must be nonnegative.");
-  }
+  ASSERT(value >= 0, "Value of nonnegative_int must be nonnegative");
   this->value_ = value;
 }
 
 nonnegative_int::nonnegative_int(size_t value) {
-  if (value > std::numeric_limits<int>::max()) {
-    throw std::invalid_argument(fmt::format(
-        "Input {} to nonnegative_int(size_t) is out-of-bounds for int",
-        value));
-  }
+  ASSERT(value <= std::numeric_limits<int>::max());
+  this->value_ = static_cast<int>(value);
+  ASSERT(this->value_ >= 0, "Value of nonnegative_int must be nonnegative");
+}
+
+nonnegative_int::nonnegative_int(unsigned long long int value) {
+  ASSERT(value <= std::numeric_limits<int>::max());
   this->value_ = static_cast<int>(value);
-  assert(this->value_ >= 0);
+  ASSERT(this->value_ >= 0, "Value of nonnegative_int must be nonnegative");
 }
 
 nonnegative_int::operator int() const noexcept {
@@ -27,18 +27,23 @@ bool nonnegative_int::operator<(nonnegative_int const &other) const {
   return this->value_ < other.value_;
 }
+
 bool nonnegative_int::operator==(nonnegative_int const &other) const {
   return this->value_ == other.value_;
 }
+
 bool nonnegative_int::operator>(nonnegative_int const &other) const {
   return this->value_ > other.value_;
 }
+
 bool nonnegative_int::operator<=(nonnegative_int const &other) const {
   return this->value_ <= other.value_;
 }
+
 bool nonnegative_int::operator!=(nonnegative_int const &other) const {
   return this->value_ != other.value_;
 }
+
 bool nonnegative_int::operator>=(nonnegative_int const &other) const {
   return this->value_ >= other.value_;
 }
@@ -46,18 +51,23 @@ bool nonnegative_int::operator>=(nonnegative_int const &other) const {
 bool nonnegative_int::operator<(int const &other) const {
   return this->value_ < other;
 }
+
 bool nonnegative_int::operator==(int const &other) const {
   return this->value_ == other;
 }
+
 bool nonnegative_int::operator>(int const &other) const {
   return this->value_ > other;
 }
+
 bool nonnegative_int::operator<=(int const &other) const {
   return this->value_ <= other;
 }
+
 bool nonnegative_int::operator!=(int const &other) const {
   return this->value_ != other;
 }
+
 bool nonnegative_int::operator>=(int const &other) const {
   return this->value_ >= other;
 }
@@ -65,18 +75,23 @@ bool nonnegative_int::operator>=(int const &other) const {
 bool operator<(int const &lhs, nonnegative_int const &rhs) {
   return lhs < rhs.value_;
 }
+
 bool operator==(int const &lhs, nonnegative_int const &rhs) {
   return lhs == rhs.value_;
 }
+
 bool operator>(int const &lhs, nonnegative_int const &rhs) {
   return lhs > rhs.value_;
 }
+
 bool operator<=(int const &lhs, nonnegative_int const &rhs) {
   return lhs <= rhs.value_;
 }
+
 bool operator!=(int const &lhs, nonnegative_int const &rhs) {
   return lhs != rhs.value_;
 }
+
 bool operator>=(int const &lhs, nonnegative_int const &rhs) {
   return lhs >= rhs.value_;
 }
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc
index f31db6d589..8195759388 100644
--- a/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc
+++ b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc
@@ -9,6 +9,10 @@ std::vector<nonnegative_int> nonnegative_range(nonnegative_int end) {
       [](int x) { return nonnegative_int{x}; });
 }
 
+std::vector<nonnegative_int> nonnegative_range(positive_int end) {
+  return nonnegative_range(end.nonnegative_int_from_positive_int());
+}
+
 std::vector<nonnegative_int>
     nonnegative_range(nonnegative_int start, nonnegative_int end, int step) {
   return transform(
diff --git a/lib/utils/src/utils/rapidcheck/half.cc b/lib/utils/src/utils/rapidcheck/half.cc
new file mode 100644
index 0000000000..80d009364a
--- /dev/null
+++ b/lib/utils/src/utils/rapidcheck/half.cc
@@ -0,0 +1,9 @@
+#include "utils/rapidcheck/half.h"
+
+namespace rc {
+
+Gen<::half> Arbitrary<::half>::arbitrary() {
+  return gen::construct<::half>(gen::arbitrary<float>());
+}
+
+} // namespace rc
diff --git a/lib/utils/src/utils/rapidcheck/monostate.cc b/lib/utils/src/utils/rapidcheck/monostate.cc
new file mode 100644
index 0000000000..96c72373aa
--- /dev/null
+++ b/lib/utils/src/utils/rapidcheck/monostate.cc
@@ -0,0 +1,9 @@
+#include "utils/rapidcheck/monostate.h"
+
+namespace rc {
+
+Gen<std::monostate> Arbitrary<std::monostate>::arbitrary() {
+  return gen::construct<std::monostate>();
+}
+
+} // namespace rc
diff --git a/lib/utils/src/utils/units/milliseconds_t.cc b/lib/utils/src/utils/units/milliseconds_t.cc
new file mode 100644
index 0000000000..fb0dd01d64
--- /dev/null
+++ b/lib/utils/src/utils/units/milliseconds_t.cc
@@ -0,0 +1,94 @@
+#include "utils/units/milliseconds_t.h"
+#include "utils/hash-utils.h"
+#include
+#include
+#include
+
+namespace FlexFlow {
+
+milliseconds_t::milliseconds_t(float value) : value(value) {}
+
+bool milliseconds_t::operator<(milliseconds_t const &other) const {
+  return this->value < other.value;
+}
+
+bool milliseconds_t::operator==(milliseconds_t const &other) const {
+  return this->value == other.value;
+}
+
+bool milliseconds_t::operator>(milliseconds_t const &other) const {
+  return this->value > other.value;
+}
+
+bool milliseconds_t::operator<=(milliseconds_t const &other) const {
+  return this->value <= other.value;
+}
+
+bool milliseconds_t::operator!=(milliseconds_t const &other) const {
+  return this->value != other.value;
+}
+
+bool milliseconds_t::operator>=(milliseconds_t const &other) const {
+  return this->value >= other.value;
+}
+
+milliseconds_t milliseconds_t::operator+(milliseconds_t const &other) const {
+  return milliseconds_t{
+      this->value + other.value,
+  };
+}
+
+float milliseconds_t::unwrap_milliseconds() const {
+  return this->value;
+}
+
+milliseconds_t operator""_ms(long double x) {
+  ASSERT(x <= std::numeric_limits<float>::max());
+  ASSERT(x >= std::numeric_limits<float>::lowest());
+  return milliseconds_t{static_cast<float>(x)};
+}
+
+milliseconds_t operator""_ms(unsigned long long int x) {
+  ASSERT(x <= std::numeric_limits<float>::max());
+  return milliseconds_t{static_cast<float>(x)};
+}
+
+std::ostream &operator<<(std::ostream &s, milliseconds_t const &m) {
+  return (s << fmt::to_string(m));
+}
+
+std::string format_as(milliseconds_t const &m) {
+  return fmt::format("{}_ms", m.unwrap_milliseconds());
+}
+
+} // namespace FlexFlow
+
+namespace nlohmann {
+::FlexFlow::milliseconds_t
+    adl_serializer<::FlexFlow::milliseconds_t>::from_json(json const &j) {
+  return ::FlexFlow::milliseconds_t{j.template get<float>()};
+}
+
+void adl_serializer<::FlexFlow::milliseconds_t>::to_json(
+    json &j, ::FlexFlow::milliseconds_t t) {
+  j = t.unwrap_milliseconds();
+}
+} // namespace nlohmann
+
+namespace rc {
+
+Gen<::FlexFlow::milliseconds_t>
+    Arbitrary<::FlexFlow::milliseconds_t>::arbitrary() {
+  return gen::construct<::FlexFlow::milliseconds_t>(gen::arbitrary<float>());
+}
+
+} // namespace rc
+
+namespace std {
+
+size_t hash<::FlexFlow::milliseconds_t>::operator()(
+    ::FlexFlow::milliseconds_t const &m) const noexcept {
+  return ::FlexFlow::get_std_hash(m.unwrap_milliseconds());
+}
+
+} // namespace std
diff --git a/lib/utils/src/utils/units/num_bytes_t.cc b/lib/utils/src/utils/units/num_bytes_t.cc
new file mode 100644
index 0000000000..f845d0a91b
--- /dev/null
+++ b/lib/utils/src/utils/units/num_bytes_t.cc
@@ -0,0 +1,87 @@
+#include "utils/units/num_bytes_t.h"
+#include "utils/hash-utils.h"
+#include
+#include
+#include
+
+namespace FlexFlow {
+
+num_bytes_t::num_bytes_t(nonnegative_int value) : value(value) {}
+
+bool num_bytes_t::operator<(num_bytes_t const &other) const {
+  return this->value < other.value;
+}
+
+bool num_bytes_t::operator==(num_bytes_t const &other) const {
+  return this->value == other.value;
+}
+
+bool num_bytes_t::operator>(num_bytes_t const &other) const {
+  return this->value > other.value;
+}
+
+bool num_bytes_t::operator<=(num_bytes_t const &other) const {
+  return this->value <= other.value;
+}
+
+bool num_bytes_t::operator!=(num_bytes_t const &other) const {
+  return this->value != other.value;
+}
+
+bool num_bytes_t::operator>=(num_bytes_t const &other) const {
+  return this->value >= other.value;
+}
+
+num_bytes_t num_bytes_t::operator+(num_bytes_t const &other) const {
+  return num_bytes_t{
+      this->value + other.value,
+  };
+}
+
+nonnegative_int num_bytes_t::unwrap_num_bytes() const {
+  return this->value;
+}
+
+num_bytes_t operator""_bytes(unsigned long long int x) {
+  return num_bytes_t{nonnegative_int{x}};
+}
+
+std::ostream &operator<<(std::ostream &s, num_bytes_t const &m) {
+  return (s << fmt::to_string(m));
+}
+
+std::string format_as(num_bytes_t const &m) {
+  return fmt::format("{}_bytes", m.unwrap_num_bytes());
+}
+
+} // namespace FlexFlow
+
+namespace nlohmann {
+::FlexFlow::num_bytes_t
+    adl_serializer<::FlexFlow::num_bytes_t>::from_json(json const &j) {
+  return ::FlexFlow::num_bytes_t{j.template get<::FlexFlow::nonnegative_int>()};
+}
+
+void adl_serializer<::FlexFlow::num_bytes_t>::to_json(
+    json &j, ::FlexFlow::num_bytes_t t) {
+  j = t.unwrap_num_bytes();
+}
+} // namespace nlohmann
+
+namespace rc {
+
+Gen<::FlexFlow::num_bytes_t> Arbitrary<::FlexFlow::num_bytes_t>::arbitrary() {
+  return gen::construct<::FlexFlow::num_bytes_t>(
+      gen::arbitrary<::FlexFlow::nonnegative_int>());
+}
+
+} // namespace rc
+
+namespace std {
+
+size_t hash<::FlexFlow::num_bytes_t>::operator()(
+    ::FlexFlow::num_bytes_t const &m) const noexcept {
+  return ::FlexFlow::get_std_hash(m.unwrap_num_bytes());
+}
+
+} // namespace std
diff --git a/lib/utils/test/common/include/test/utils/doctest/fmt/half.h b/lib/utils/test/common/include/test/utils/doctest/fmt/half.h
new file mode 100644
index 0000000000..f3694bb981
--- /dev/null
+++ b/lib/utils/test/common/include/test/utils/doctest/fmt/half.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_FMT_HALF_H
+#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_FMT_HALF_H
+
+#include "utils/half.h"
+#include
+
+namespace doctest {
+
+template <>
+struct StringMaker<::half> {
+  static String convert(::half const &);
+};
+
+} // namespace doctest
+
+#endif
diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc
index 6df2d925b7..9c72e6310c 100644
--- a/lib/utils/test/common/src/main.cc
+++ b/lib/utils/test/common/src/main.cc
@@ -1,6 +1,7 @@
 #define DOCTEST_CONFIG_IMPLEMENT
 #include
+#include
 #include
 #include
 
@@ -10,6 +11,7 @@ void libassert_throw_exception_handler(libassert::assertion_info const &info) {
 
 int main(int argc, char **argv) {
   libassert::set_failure_handler(libassert_throw_exception_handler);
+  cpptrace::register_terminate_handler();
   return doctest::Context(argc, argv).run();
 }
diff --git a/lib/utils/test/common/src/test/utils/doctest/fmt/half.cc b/lib/utils/test/common/src/test/utils/doctest/fmt/half.cc
new file mode 100644
index 0000000000..c2e8124678
--- /dev/null
+++ b/lib/utils/test/common/src/test/utils/doctest/fmt/half.cc
@@ -0,0 +1,9 @@
+#include "test/utils/doctest/fmt/half.h"
+
+namespace doctest {
+
+String StringMaker<::half>::convert(::half const &h) {
+  return toString(static_cast<float>(h));
+}
+
+} // namespace doctest
diff --git a/lib/utils/test/src/utils/containers/all_are_true.cc b/lib/utils/test/src/utils/containers/all_are_true.cc
new file mode 100644
index 0000000000..3a725a7d00
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/all_are_true.cc
@@ -0,0 +1,36 @@
+#include "utils/containers/all_are_true.h"
+#include
+#include
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("all_are_true") {
+    SUBCASE("all elements are true") {
+      std::vector<bool> input = {true, true, true};
+
+      bool result = all_are_true(input);
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("not all elements are true") {
+      std::vector<bool> input = {true, false, true, false};
+
+      bool result = all_are_true(input);
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("empty input vector") {
+      std::vector<bool> input = {};
+
+      bool result = all_are_true(input);
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/containers/collapse_optionals.cc b/lib/utils/test/src/utils/containers/collapse_optionals.cc
new file mode 100644
index 0000000000..201b1bdf02
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/collapse_optionals.cc
@@ -0,0 +1,38 @@
+#include "utils/containers/collapse_optionals.h"
+#include "test/utils/doctest/fmt/optional.h"
+#include
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("collapse_optionals(std::optional<std::optional<T>>)") {
+    SUBCASE("returns the value if the input has value") {
+      std::optional<std::optional<int>> input = 8;
+
+      std::optional<int> result = collapse_optionals(input);
+      std::optional<int> correct = 8;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns nullopt if the input is wrapped nullopt") {
+      std::optional<std::optional<int>> input =
+          std::optional<int>{std::nullopt};
+
+      std::optional<int> result = collapse_optionals(input);
+      std::optional<int> correct = std::nullopt;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns nullopt if the input is unwrapped nullopt") {
+      std::optional<std::optional<int>> input =
+          std::optional<std::optional<int>>{std::nullopt};
+
+      std::optional<int> result = collapse_optionals(input);
+      std::optional<int> correct = std::nullopt;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/containers/contains_value.cc b/lib/utils/test/src/utils/containers/contains_value.cc
new file mode 100644
index 0000000000..136ef3b304
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/contains_value.cc
@@ -0,0 +1,51 @@
+#include "utils/containers/contains_value.h"
+#include
+#include
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("contains_value(std::unordered_map<K, V>, V)") {
+    std::unordered_map<int, std::string> m = {
+        {1, "one"},
+        {3, "three"},
+        {4, "three"},
+    };
+
+    SUBCASE("returns true if the value is in the map") {
+      bool result = contains_value(m, std::string{"one"});
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns false if the value is not in the map") {
+      bool result = contains_value(m, std::string{"two"});
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("contains_value(std::map<K, V>, V)") {
+    std::map<int, std::string> m = {
+        {1, "one"},
+        {3, "three"},
+        {4, "three"},
+    };
+
+    SUBCASE("returns true if the value is in the map") {
+      bool result = contains_value(m, std::string{"one"});
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns false if the value is not in the map") {
+      bool result = contains_value(m, std::string{"two"});
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc
index 77ecbf854d..88454bbfbd 100644
--- a/lib/utils/test/src/utils/positive_int/positive_int.cc
+++ b/lib/utils/test/src/utils/positive_int/positive_int.cc
@@ -1,6 +1,6 @@
 #include "utils/positive_int/positive_int.h"
-#include
 #include "test/utils/rapidcheck.h"
+#include
 
 using namespace ::FlexFlow;
 
@@ -51,16 +51,16 @@
   }
 
   TEST_CASE("_p notation for positive_int") {
-    CHECK(9_p == positive_int{9});
+    CHECK(9_p == positive_int{9});
     CHECK_THROWS(0_p);
   }
 
   TEST_CASE("static_cast<int>(positive_int)") {
-    CHECK(static_cast<int>(8_p) == 8);
+    CHECK(static_cast<int>(8_p) == 8);
   }
 
   TEST_CASE("static_cast<nonnegative_int>(positive_int)") {
-    CHECK(static_cast<nonnegative_int>(6_p) == 6);
+    CHECK(static_cast<nonnegative_int>(6_p) == 6);
   }
 
   TEST_CASE("positive_int < positive_int") {
@@ -321,7 +321,7 @@
   }
 
   TEST_CASE("positive_int * positive_int") {
-    CHECK(3_p * 4_p == 12_p);
+    CHECK(3_p * 4_p == 12_p);
   }
 
   TEST_CASE("positive_int *= positive_int") {
@@ -339,7 +339,7 @@
   }
 
   TEST_CASE("positive_int * nonnegative_int") {
-    CHECK(3_p * 4_n == 12_n);
+    CHECK(3_p * 4_n == 12_n);
     CHECK(3_p * 0_n == 0_n);
   }
 
@@ -360,10 +360,10 @@
   }
 
   TEST_CASE("float / positive_int") {
-    CHECK(4.0f / 2_p == 2.0f);
-    CHECK(3.0f / 2_p == 1.5f);
-    CHECK(-3.0f / 4_p == -0.75f);
-    CHECK(0.0f / 1_p == 0.0f);
+    CHECK(4.0f / 2_p == 2.0f);
+    CHECK(3.0f / 2_p == 1.5f);
+    CHECK(-3.0f / 4_p == -0.75f);
+    CHECK(0.0f / 1_p == 0.0f);
   }
 
   TEST_CASE("float /= positive_int") {
@@ -411,11 +411,11 @@
   }
 
   TEST_CASE("positive_int::int_from_positive_int()") {
-    CHECK((3_p).int_from_positive_int() == 3);
+    CHECK((3_p).int_from_positive_int() == 3);
   }
 
   TEST_CASE("positive_int::nonnegative_int_from_positive_int()") {
-    CHECK((4_p).nonnegative_int_from_positive_int() == 4);
+    CHECK((4_p).nonnegative_int_from_positive_int() == 4);
   }
 
   TEST_CASE("positive_int::operator<<(std::ostream &, positive_int)") {
@@ -443,7 +443,7 @@
     nlohmann::json correct = 5;
 
     CHECK(result == correct);
-  }
+  }
 
   SUBCASE("from_json") {
     nlohmann::json input = 5;
@@ -480,6 +480,6 @@
   }
 
   TEST_CASE("rc::Arbitrary<positive_int>") {
-    RC_SUBCASE([](positive_int) { });
+    RC_SUBCASE([](positive_int) {});
  }
 }

From 426206e90d63f3ab2d6411a1c1a300883b9cd472 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Tue, 5 Aug 2025 20:42:59 -0700
Subject: [PATCH 90/91] fix: e2e test for realm backend

---
 .../src/realm_training_backing.cc         |   4 +-
 lib/realm-backend/test/src/test_e2e.cc    |   2 +-
 lib/realm-backend/test/src/test_update.cc | 124 ------------------
 3 files changed, 4 insertions(+), 126 deletions(-)
 delete mode 100644 lib/realm-backend/test/src/test_update.cc

diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index b436443cdb..66bc098e07 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -22,7 +22,7 @@ namespace FlexFlow {
 
 using namespace Realm;
 
-LocalTrainingBacking make_local_training_backing_for_computation_graph(
+LocalTrainingBacking make_realm_training_backing_for_computation_graph(
     RealmRuntimeState &runtime_state,
     std::unordered_map const &preallocated,
@@ -267,6 +267,8 @@ Future execute_update(LocalTrainingBacking const &local_training_backing,
     runtime_state.worker_events[0] = e;
     future.set_event(e);
     return future;
+  } else {
+    return Future();
   }
 }
 
diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc
index 66ff034240..f7a338d32b 100644
--- a/lib/realm-backend/test/src/test_e2e.cc
+++ b/lib/realm-backend/test/src/test_e2e.cc
@@ -194,6 +194,6 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
   GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
   GenericTensorAccessorR last_epoch = loss_values.back();
   assert(did_loss_decrease(first_epoch_loss, last_epoch));
-  printf("passed\n");
+  printf("passed!\n");
 }
 }
diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc
deleted file mode 100644
index cd7119271d..0000000000
--- a/lib/realm-backend/test/src/test_update.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-#include "kernels/managed_ff_stream.h"
-#include "kernels/managed_per_device_ff_handle.h"
-#include "local-execution/allocated_tensors.h"
-#include "pcg/computation_graph.h"
-#include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.dtg.h"
-#include "realm-backend/driver.h"
-#include "realm-backend/realm_allocator.h"
-#include "realm-backend/local_training_backing.h"
-#include "test_utils.h"
-
-using namespace ::FlexFlow;
-using namespace Realm;
-
-void top_level_task(const void *args, size_t arglen, const void *userdata,
-                    size_t userlen, Realm::Processor p) {
-  // initialize runtime configs
-  ManagedFFStream managed_stream{};
-  ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
-  std::vector<Processor> worker_procs;
-  std::vector<Allocator> allocators;
-  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
-                                   .only_kind(Processor::TOC_PROC);
-  assert(pq.count() > 0);
-  for (Processor p : pq) {
-    worker_procs.push_back(p);
-    allocators.push_back(create_realm_memory_allocator(p));
-  }
-
-  AllocatedTensors allocated_tensors = make_empty_allocated_tensors();
-
-  // construct computation graph
-  ComputationGraph computation_graph = make_empty_computation_graph();
-
-  nonnegative_int batch_size = 10_n;
-  nonnegative_int data_dim = 16_n;
-  nonnegative_int output_dim = 32_n;
-
-  TensorShape input_tensor_shape =
-      TensorShape{TensorDims{FFOrdered<nonnegative_int>{batch_size, data_dim}},
-                  DataType::FLOAT};
-
-  TensorShape weight_shape =
-      TensorShape{TensorDims{FFOrdered<nonnegative_int>{data_dim, output_dim}},
-                  DataType::FLOAT};
-
-  LayerAddedResult inputs_layer =
-      add_input_layer(computation_graph, input_tensor_shape);
-
-  LayerAddedResult weights_layer = add_layer(
-      computation_graph,
-      LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
-                     weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}},
-                 "weights"},
-      {}, {});
-
-  LayerAddedResult linear_operator =
-      add_layer(computation_graph,
-                LayerAttrs{ComputationGraphOpAttrs{
-                               LinearAttrs{output_dim,
-                                           /*use_bias=*/false, DataType::FLOAT,
-                                           Activation::RELU, std::nullopt}},
-                           "linear"},
-                inputs_layer.outputs, weights_layer.outputs);
-
-  RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
-      DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
-      EnableProfiling::YES,
-      ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
-
-  GradientTensorSource gradient_tensor_source;
-  OptimizerTensorSource optimizer_tensor_source;
-
-  int test_id = 0;
-
-  {
-    printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0...\n", ++test_id);
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
-                                         /*momentum=*/0.0f,
-                                         /*nesterov=*/false,
-                                         /*weight_decay=*/0.001}};
-    LocalTrainingBacking local_training_backing = LocalTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait();
-    printf("passed\n");
-  }
-
-  {
-    printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0.9...\n", ++test_id);
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
-                                         /*momentum=*/0.9,
-                                         /*nesterov=*/false,
-                                         /*weight_decay=*/0.001}};
-    LocalTrainingBacking local_training_backing = LocalTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait();
-    printf("passed\n");
-  }
-
-  {
-    printf("\nRunning test %d: AdamOptimizerAttrs...\n", ++test_id);
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001,
-                                          /*beta1=*/0.9,
-                                          /*beta2=*/0.999,
-                                          /*weight_decay=*/0.001,
-                                          /*alpha_t=*/0.001,
-                                          /*beta_t=*/0.9,
-                                          /*beta2_t=*/0.999,
-                                          /*epsilon=*/1e-8}};
-    LocalTrainingBacking local_training_backing = LocalTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait();
-    printf("passed\n");
-  }
-}

From b21edc6c785bd770787259f11a65a2a7662f51a8 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com>
Date: Fri, 5 Dec 2025 14:32:26 -0500
Subject: [PATCH 91/91] Delete lib/realm-backend/src/realm_training_backing
 copy.cc

---
 .../src/realm_training_backing copy.cc | 126 ------------------
 1 file changed, 126 deletions(-)
 delete mode 100644 lib/realm-backend/src/realm_training_backing copy.cc

diff --git a/lib/realm-backend/src/realm_training_backing copy.cc b/lib/realm-backend/src/realm_training_backing copy.cc
deleted file mode 100644
index e6a3079a25..0000000000
--- a/lib/realm-backend/src/realm_training_backing copy.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-// #include "kernels/allocation.h"
-// #include "local-execution/loss_functions.h"
-// #include "local-execution/optimizer.h"
-// #include "pcg/computation_graph.dtg.h"
-// #include "pcg/computation_graph.h"
-// #include "pcg/optimizer_attrs.h"
-// #include "realm-backend/realm_tensor_backing.h"
-// #include "task-spec/op_task_to_task_invocation.h"
-// #include "task-spec/runtime_arg_config.h"
-// #include "task-spec/task_invocation.h"
-// #include "task-spec/task_signature_impl.h"
-// #include "utils/containers/contains.h"
-// #include "utils/containers/contains_key.h"
-// #include "utils/containers/get_only.h"
-// #include "utils/containers/values.h"
-// #include "utils/exception.h"
-
-// #include "realm-backend/realm_training_backing.h"
-// #include "realm-backend/task_result.h"
-// #include "realm-backend/task_wrapper.h"
-
-// namespace FlexFlow {
-
-// using namespace Realm;
-
-// RealmTrainingBacking::RealmTrainingBacking(
-//     Processor master_proc, std::vector<Processor> const &worker_procs,
-//     std::vector<Allocator> const &allocators,
-//     AllocatedTensors const &allocated_tensors,
-//     GradientTensorSource &gradient_tensor_source,
-//     ComputationGraph const &computation_graph,
-//     RuntimeArgConfig const &runtime_arg_config)
-//     : master_proc(master_proc), master_event(Realm::Event::NO_EVENT),
-//       master_mem(Machine::MemoryQuery(Machine::get_machine())
-//                      .only_kind(Memory::SYSTEM_MEM)
-//                      .best_affinity_to(master_proc)
-//                      .first()),
-//       worker_procs(worker_procs),
-//       worker_events(std::vector<Realm::Event>(worker_procs.size(),
-//                                               Realm::Event::NO_EVENT)),
-//       allocators(allocators), computation_graph(computation_graph),
-//       task_registry(construct_task_registry_and_register_tasks_for_realm(
-//           computation_graph, worker_procs)),
-//       realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu
-//           allocated_tensors,
-//           generate_unallocated_tensors(
-//               allocated_tensors, get_all_tensor_attrs(computation_graph),
-//               gradient_tensor_source),
-//           this->allocators[0])),
-//       realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {}
-
-// TaskRegistry construct_task_registry_and_register_tasks_for_realm(
-//     ComputationGraph const &cg, std::vector<Processor> const &worker_procs) {
-//   TaskRegistry task_registry = construct_task_registry(
-//       get_layer_attrs_mapping(cg));
-
-//   // register tasks for realm
-//   std::unordered_map<layer_guid_t, LayerAttrs> const &layer_attrs_mapping =
-//       get_layer_attrs_mapping(cg);
-//   for (std::pair<layer_guid_t, LayerAttrs> const &layer_attrs :
-//        layer_attrs_mapping) {
-//     ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs;
-//     std::vector<task_id_t> task_ids = get_task_ids(attrs);
-//     for (task_id_t task_id : task_ids) {
-//       TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
-//       // TODO: multi gpu
-//       register_wrapper_tasks(0, worker_procs[0], task_id, task_signature_impl);
-//     }
-//   }
-
-//   return task_registry;
-// }
-
-// RealmArgsBacking
-//     initialize_args_backing(RealmTrainingBacking *backing,
-//                             ComputationGraph const &cg,
-//                             RuntimeArgConfig const &runtime_arg_config) {
-//   std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates>
-//       per_device_op_states;
-//   TaskRegistry const &task_registry = backing->task_registry;
-//   RealmTensorBacking const &realm_tensor_backing =
-//       backing->realm_tensor_backing;
-//   Processor master_proc = backing->master_proc;
-//   Memory master_mem = backing->master_mem;
-//   std::vector<Processor> &worker_procs = backing->worker_procs;
-//   std::vector<Realm::Event> &worker_events = backing->worker_events;
-//   // TODO: multi gpu
-//   Allocator &allocator = backing->allocators[0];
-
-//   for (layer_guid_t const &node : topological_ordering(cg)) {
-//     if (registry_contains_task_for_layer(task_registry, node,
-//                                          OpTaskType::INIT)) {
-//       ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs;
-
-//       TaskInvocation invocation = lower_to_task_invocation(
-//           init(attrs), node, get_incoming_inputs(cg, node),
-//           get_incoming_input_shapes(cg, node), get_outgoing_tensors(cg, node),
-//           get_incoming_weights(cg, node),
-//           realm_tensor_backing.tensor_gradient_mapping, std::nullopt);
-//       TaskArgumentAccessor accessor = get_task_arg_accessor(
-//           realm_tensor_backing,
-//           make_args_backing_with_empty_device_states(runtime_arg_config),
-//           invocation,
-//           allocator);
-//       task_id_t task_id = invocation.task_id;
-//       TaskImplFunction impl_function =
-//           task_registry.task_mapping.at(task_id).impl_function;
-//       // TODO: multi gpu launching
-//       Promise promise = Promise();
-//       Future future = promise.get_future();
-//       RealmTaskArgs* task_arg = new RealmTaskArgs{
-//           task_id, impl_function, accessor, std::move(promise)};
-//       uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-//       Event e =
-//           worker_procs[0].spawn(get_realm_task_id(task_id),
-//                                 args, sizeof(uintptr_t), worker_events[0]);
-//       worker_events[0] = e;
-//       future.set_event(e);
-//       per_device_op_states.insert({node, future.get().value()});
-//     }
-//   }
-
-//   return RealmArgsBacking{runtime_arg_config, per_device_op_states};
-// }
-
-// } // namespace FlexFlow
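
A note on the explicit-instantiation pattern used throughout the lib/utils changes above: each new .cc file pins a template to an archetype type (value_type<0>, ordered_value_type<0>) so the template body is compiled and type-checked when lib/utils itself is built, rather than at the first call site. A minimal self-contained sketch of the same idea follows; the value_type_0 stand-in and the file layout are illustrative assumptions, not the actual FlexFlow headers:

    // collapse_optionals.h -- header-only template (illustrative)
    #include <optional>

    template <typename T>
    std::optional<T>
        collapse_optionals(std::optional<std::optional<T>> const &o) {
      // flatten optional<optional<T>> -> optional<T>; an engaged outer
      // optional may still hold an inner nullopt, which passes through
      if (o.has_value()) {
        return o.value();
      }
      return std::nullopt;
    }

    // collapse_optionals.cc -- pin one explicit instantiation
    struct value_type_0 {}; // minimal stand-in for value_type<0>
    template std::optional<value_type_0>
        collapse_optionals(std::optional<std::optional<value_type_0>> const &);

Instantiating against a deliberately featureless archetype also documents which operations the template actually requires of T (here, little more than copyability), since anything extra would fail to compile.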